simplify autofill workflow and README instructions

This commit is contained in:
Vedant Mukherjee
2025-11-15 13:52:19 -05:00
parent 5959c24e92
commit 5e8a0a61bb
5 changed files with 486 additions and 2 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@
.idea
.vscode
venv/

View File

@@ -53,6 +53,40 @@ This list aims to serve as a single centralized location for the best of open so
This list aims to stand in the middle ground between human input and automation. Mostly automated websites exist for finding open source projects, but statistics alone fail to encompass the complete picture. This list has scripts to automate markdown formatting, updating stats, and finding potentially abandoned projects. However, the actual processes of choosing which projects make it onto the list, which ones should be removed, and what tags to assign are controlled entirely by humans.
</details>
## Adding New Apps Quickly
Use the `scripts/utils/contributing_autofill.py` helper whenever you want to add a new repository to `applications.json`.
- Run `python3 scripts/utils/contributing_autofill.py <repo-url>`; the script fetches GitHub metadata, asks you for any missing info (platforms, tags, category), and then appends the finished JSON object to `source/data/dynamic/applications.json`.
- If you export a `GITHUB_TOKEN` in your shell, the script will automatically use it to avoid GitHub rate limits.
- Pass `--full-details` if you'd like the helper to also populate description, stars, languages, license, homepage, and last commit—otherwise those fields stay blank for the nightly stats updater.
- Duplicate repositories are detected automatically—the script will skip the entry and tell you if that repo already exists so you can update it manually instead.
- After it runs, double-check that the entry has `name`, `repo_url`, `tags`, `platforms`, and `category` filled in—per `CONTRIBUTING.md`, every PR must include those fields before relying on the nightly stats updater.
Example (script output for `https://github.com/mozilla/pdf.js`, required fields only):
```bash
python3 scripts/utils/contributing_autofill.py https://github.com/mozilla/pdf.js
```
```json
{
"name": "Pdf.js",
"description": "",
"repo_url": "https://github.com/mozilla/pdf.js",
"tags": [],
"platforms": [
"cross"
],
"category": "window-management",
"stars": 0,
"flags": [],
"last_commit": "",
"language": "",
"license": "",
"homepage_url": ""
}
```
The command also prints notes (not shown) reminding you to fill in the missing platforms/tags/category before the PR.
## Project Status
```css
Active - Active Development
@@ -1783,4 +1817,3 @@ This project is released under the `MIT license`, hereby granting anyone to use,
</tbody>
</table>
</p>

1
requirements.txt Normal file
View File

@@ -0,0 +1 @@
Requests==2.32.5

View File

@@ -0,0 +1,433 @@
import argparse
import base64
import json
import os
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Set, Tuple
import requests
ROOT = Path(__file__).resolve().parents[2]
TAGS_FILE = ROOT / "source" / "data" / "static" / "tags.json"
PLATFORMS_FILE = ROOT / "source" / "data" / "static" / "platforms.json"
CATEGORIES_FILE = ROOT / "source" / "data" / "static" / "categories.json"
APPLICATIONS_FILE = ROOT / "source" / "data" / "dynamic" / "applications.json"
USER_AGENT = "definitive-opensource-contributing-autofill"
GITHUB_API_VERSION = "2022-11-28"
REPO_PATTERN = re.compile(
r"(?:github\.com[:/])?(?P<owner>[\w\-.]+)/(?P<repo>[\w\-.]+?)(?:\.git)?(?:[#?].*)?$",
re.IGNORECASE,
)
PLATFORM_KEYWORDS: Dict[str, str] = {}
TAG_KEYWORDS: Dict[str, str] = {}
CATEGORY_KEYWORDS: Dict[str, str] = {}
@dataclass
class ReferenceData:
    """Id sets and display labels loaded from the static tags/platforms/categories JSON files."""

    # Valid ids for each entry field; used to validate inferred or user-supplied values.
    tag_ids: Set[str]
    platform_ids: Set[str]
    category_ids: Set[str]
    # id -> human-readable label; used when printing the "Available ... ids" lists.
    tag_labels: Dict[str, str]
    platform_labels: Dict[str, str]
    category_labels: Dict[str, str]
class DuplicateRepositoryError(RuntimeError):
    """Raised when attempting to append an application that already exists.

    Matching is done on the entry's ``repo_url``; ``main`` catches this to skip
    the append instead of writing a duplicate entry.
    """
def load_reference_data() -> ReferenceData:
    """Read the static tag/platform/category JSON files and return their ids and labels."""

    def _read_json(path: Path) -> Dict:
        # All three static files are UTF-8 JSON documents.
        with open(path, "r", encoding="utf-8") as fh:
            return json.load(fh)

    tags_data = _read_json(TAGS_FILE)
    tag_labels: Dict[str, str] = {}
    # "attributes" entries prefer their description, optionally prefixed with an emoji.
    for attr in tags_data.get("attributes", []):
        text = attr.get("description") or attr.get("name") or attr["id"]
        if attr.get("emoji"):
            text = f"{attr['emoji']} {text}"
        tag_labels[attr["id"]] = text
    # "properties" entries prefer their name instead.
    for prop in tags_data.get("properties", []):
        tag_labels[prop["id"]] = prop.get("name") or prop.get("description") or prop["id"]

    platforms_data = _read_json(PLATFORMS_FILE)
    platform_labels = {
        item["id"]: item.get("name") or item["id"]
        for item in platforms_data.get("platforms", [])
    }

    categories_data = _read_json(CATEGORIES_FILE)
    category_labels: Dict[str, str] = {
        item["id"]: item.get("name") or item["id"]
        for item in categories_data.get("categories", [])
    }
    # Subcategories display as "Name (Parent Name)" when their parent label is known.
    for sub in categories_data.get("subcategories", []):
        parent_id = sub.get("parent")
        parent_text = category_labels.get(parent_id, parent_id) if parent_id else None
        sub_name = sub.get("name") or sub["id"]
        category_labels[sub["id"]] = f"{sub_name} ({parent_text})" if parent_text else sub_name

    return ReferenceData(
        tag_ids=set(tag_labels),
        platform_ids=set(platform_labels),
        category_ids=set(category_labels),
        tag_labels=tag_labels,
        platform_labels=platform_labels,
        category_labels=category_labels,
    )
def render_options(options: Dict[str, str], indent: str = " ") -> str:
    """Format an id->label mapping as a sorted bulleted list, one option per line."""
    rendered = []
    for option_id, option_label in sorted(options.items()):
        # Show "id: label" unless the label adds nothing over the id itself.
        if option_label and option_label != option_id:
            rendered.append(f"{indent}- {option_id}: {option_label}")
        else:
            rendered.append(f"{indent}- {option_id}")
    return "\n".join(rendered)
def available_text(label: str, options: Dict[str, str]) -> str:
    """Return an "Available ... ids" header plus the rendered option list, or "" when empty."""
    if not options:
        return ""
    body = render_options(options)
    return f"Available {label} ids:\n{body}"
def load_applications_data(path: Path) -> Dict:
    """Parse and return the applications JSON document at *path*."""
    return json.loads(path.read_text(encoding="utf-8"))
def persist_applications_data(path: Path, data: Dict) -> None:
    """Serialize *data* as 4-space-indented JSON to *path*, ending with a trailing newline."""
    with open(path, "w", encoding="utf-8") as out:
        out.write(json.dumps(data, indent=4))
        out.write("\n")
def append_application(entry: Dict, path: Path) -> None:
    """Append *entry* to the applications file at *path* and persist it.

    Raises DuplicateRepositoryError when an entry with the same ``repo_url``
    already exists. Comparison ignores trailing slashes and letter case:
    GitHub treats owner/repo names case-insensitively, so the previous
    case-sensitive check let e.g. ``Mozilla/pdf.js`` slip past an existing
    ``mozilla/pdf.js`` entry.
    """
    data = load_applications_data(path)
    applications = data.setdefault("applications", [])
    new_url = (entry.get("repo_url") or "").rstrip("/").lower()
    for existing in applications:
        if (existing.get("repo_url") or "").rstrip("/").lower() == new_url:
            # Show a repo-relative path in the error when possible.
            try:
                display_path = path.relative_to(ROOT)
            except ValueError:
                display_path = path
            raise DuplicateRepositoryError(
                f"Repository {entry['repo_url']} already exists in {display_path}."
            )
    applications.append(entry)
    persist_applications_data(path, data)
def parse_repo_identifier(value: str) -> Tuple[str, str]:
    """Extract ``(owner, repo)`` from a GitHub URL or an ``owner/repo`` slug.

    Trailing slashes (e.g. ``https://github.com/mozilla/pdf.js/``) are stripped
    before matching. Without this the regex cannot match (its repo character
    class excludes ``/`` and must reach end-of-string), and the naive
    ``split("/", 1)`` fallback would return garbage such as
    ``("https:", "//github.com/mozilla/pdf.js/")``.

    Raises ValueError when no owner/repo pair can be found.
    """
    value = value.strip().rstrip("/")
    match = REPO_PATTERN.search(value)
    if match:
        return match.group("owner"), match.group("repo")
    # Fallback for bare "owner/repo" slugs the regex did not capture.
    if "/" in value:
        owner, repo = value.split("/", 1)
        return owner, repo
    raise ValueError(f"Could not parse repository from '{value}'.")
def github_request(path: str, token: Optional[str], params: Optional[Dict[str, str]] = None) -> requests.Response:
    """Perform a GET against the GitHub REST API and return the raw response.

    Sends the versioned Accept/User-Agent headers and, when *token* is set, a
    Bearer Authorization header. Raises RuntimeError on an authentication
    failure (401) or an explicit rate-limit 403.
    """
    auth = {"Authorization": f"Bearer {token}"} if token else {}
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": USER_AGENT,
        "X-GitHub-Api-Version": GITHUB_API_VERSION,
        **auth,
    }
    endpoint = "https://api.github.com/" + path.lstrip("/")
    response = requests.get(endpoint, headers=headers, params=params or {})
    if response.status_code == 401:
        raise RuntimeError("GitHub authentication failed. Set the GITHUB_TOKEN environment variable.")
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise RuntimeError("GitHub rate limit exceeded. Provide a token to continue.")
    return response
def fetch_repo(owner: str, repo: str, token: Optional[str]) -> Dict:
    """Fetch repository metadata from the GitHub API; raise RuntimeError on any non-200."""
    response = github_request(f"repos/{owner}/{repo}", token, params={"per_page": 1})
    if response.status_code == 200:
        return response.json()
    raise RuntimeError(f"Failed to fetch repo metadata ({response.status_code}): {response.text}")
def fetch_readme_excerpt(owner: str, repo: str, token: Optional[str]) -> Optional[str]:
    """Return the first non-empty line of the repo README, or None if unavailable.

    A leading Markdown heading marker is dropped and a trailing period trimmed,
    so the result can be reused directly as a one-line description.
    """
    response = github_request(f"repos/{owner}/{repo}/readme", token)
    if response.status_code != 200:
        return None
    encoded = response.json().get("content")
    if not encoded:
        return None
    try:
        # The readme endpoint returns base64-encoded file content.
        text = base64.b64decode(encoded).decode("utf-8", errors="replace")
    except (ValueError, UnicodeDecodeError):
        return None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        if candidate.startswith("#"):
            candidate = candidate.lstrip("#").strip()
        candidate = candidate.rstrip(".")
        if candidate:
            return candidate
    return None
def normalize_project_name(repo_name: str) -> str:
    """Turn a repository slug into a display name.

    Names that already contain an uppercase letter are kept verbatim; otherwise
    hyphen/underscore-separated tokens are capitalized and joined with spaces.
    """
    if not repo_name:
        return ""
    has_upper = any(ch.isalpha() and ch.isupper() for ch in repo_name)
    if has_upper:
        return repo_name
    words = [part for part in re.split(r"[-_]", repo_name) if part]
    if not words:
        return repo_name.capitalize()
    return " ".join(word.capitalize() for word in words)
def iso_to_mmddyyyy(value: Optional[str]) -> str:
    """Convert a GitHub-style ISO-8601 UTC timestamp to MM/DD/YYYY; "" when absent or malformed."""
    if not value:
        return ""
    try:
        parsed = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return ""
    return parsed.strftime("%m/%d/%Y")
def keyword_hits(keywords: Dict[str, str], sources: Sequence[str]) -> Set[str]:
    """Return the mapped ids whose keyword appears as a whole word in any of *sources*."""
    if not sources:
        return set()
    # Join all non-empty source strings into one lowercase haystack.
    haystack = " ".join(s for s in sources if s).lower()
    return {
        target
        for keyword, target in keywords.items()
        if re.search(rf"\b{re.escape(keyword)}\b", haystack)
    }
def infer_platforms(repo: Dict, ref: ReferenceData) -> Tuple[Set[str], List[str]]:
    """Placeholder inference: platforms are never auto-detected.

    Always returns an empty set plus a note listing the valid platform ids so
    the user knows what to fill in manually.
    """
    reminder = (
        "Platforms were not inferred automatically.\n"
        f"{available_text('platform', ref.platform_labels)}"
    )
    return set(), [reminder]
def infer_tags(repo: Dict, ref: ReferenceData) -> Tuple[Set[str], List[str]]:
    """Placeholder inference: tags are never auto-detected and no notes are emitted."""
    empty_tags: Set[str] = set()
    return empty_tags, []
def infer_category(repo: Dict, ref: ReferenceData) -> Tuple[str, List[str]]:
    """Placeholder inference: category is always empty, with a note listing valid ids."""
    guidance = (
        "Category must be specified manually.\n"
        f"{available_text('category', ref.category_labels)}"
    )
    return "", [guidance]
def prompt_list(field_label: str, options: Dict[str, str], allow_empty: bool = False) -> List[str]:
    """Interactively collect a comma-separated list of valid option ids.

    Re-prompts until every entered id is present in *options*; an empty answer
    is accepted (returning []) only when *allow_empty* is True.
    """
    valid_ids = set(options)
    print(f"\n{available_text(field_label, options)}")
    suffix = " (press Enter to skip)" if allow_empty else ""
    question = f"Enter {field_label} ids (comma separated){suffix}:\n> "
    while True:
        answer = input(question).strip()
        if allow_empty and not answer:
            return []
        chosen = [piece.strip() for piece in answer.split(",") if piece.strip()]
        bad = [piece for piece in chosen if piece not in valid_ids]
        if bad:
            print(f"Invalid values: {', '.join(bad)}. Please try again.")
        elif not chosen:
            print("At least one value is required. Press Ctrl+C to abort.")
        else:
            return chosen
def prompt_value(field_label: str, options: Dict[str, str]) -> str:
    """Interactively collect a single valid option id, re-prompting until valid."""
    valid_ids = set(options)
    print(f"\n{available_text(field_label, options)}")
    question = f"Enter {field_label} id:\n> "
    answer = input(question).strip()
    while answer not in valid_ids:
        print(f"{answer} is not a valid value. Please try again.")
        answer = input(question).strip()
    return answer
def fill_missing_with_input(entry: Dict, ref: ReferenceData) -> Dict:
    """Prompt for any of platforms/tags/category that inference left blank.

    Returns a copy of *entry*; the input dict is not mutated. Skipped entirely
    when stdin is not a TTY (e.g. piped or CI), so the script never blocks.
    """
    if not sys.stdin.isatty():
        return entry
    completed = dict(entry)
    if not completed.get("platforms"):
        completed["platforms"] = prompt_list("platform", ref.platform_labels)
    if not completed.get("tags"):
        # Tags are optional, so the user may press Enter to skip them.
        completed["tags"] = prompt_list("tag", ref.tag_labels, allow_empty=True)
    if not completed.get("category"):
        completed["category"] = prompt_value("category", ref.category_labels)
    return completed
def filter_resolved_notes(notes: List[str], entry: Dict) -> List[str]:
    """Drop notes about fields (platform/tag/category) the user has since filled in."""
    kept: List[str] = []
    for message in notes:
        text = message.lower()
        resolved = (
            ("platform" in text and entry.get("platforms"))
            or ("tag" in text and entry.get("tags"))
            or ("category" in text and entry.get("category"))
        )
        if not resolved:
            kept.append(message)
    return kept
def build_entry(
    repo_url: str,
    repo_data: Dict,
    ref: ReferenceData,
    owner: str,
    repo: str,
    full_details: bool,
    token: Optional[str],
) -> Tuple[Dict, List[str]]:
    """Assemble an applications.json entry from GitHub repository metadata.

    Returns ``(entry, notes)``: the JSON-ready dict plus human-readable
    reminders about fields that still need manual attention.

    Raises ValueError when inferred tag/platform/category ids are not present
    in the reference data.
    """
    notes: List[str] = []
    flags: Set[str] = set()

    repo_description = repo_data.get("description") or ""
    readme_description: Optional[str] = None
    # Only hit the README endpoint when we actually need a description.
    if full_details and not repo_description:
        readme_description = fetch_readme_excerpt(owner, repo, token)

    name = normalize_project_name(repo_data.get("name", ""))
    platforms, platform_notes = infer_platforms(repo_data, ref)
    notes.extend(platform_notes)
    tags, tag_notes = infer_tags(repo_data, ref)
    notes.extend(tag_notes)
    category, category_notes = infer_category(repo_data, ref)
    notes.extend(category_notes)

    entry = {
        "name": name,
        "description": "",
        "repo_url": repo_url,
        "tags": sorted(tags),
        "platforms": sorted(platforms),
        "category": category,
        "stars": 0,
        "flags": sorted(flags),
        "last_commit": "",
        "language": "",
        "license": "",
        "homepage_url": "",
    }

    if full_details:
        description_value = repo_description or readme_description or ""
        entry["description"] = description_value
        if readme_description and not repo_description:
            flags.add("custom-description")
            notes.append("Description pulled from README (custom-description flag added).")
        elif not description_value:
            notes.append("Repository has no description; field left blank.")
        entry["stars"] = repo_data.get("stargazers_count", 0)
        entry["last_commit"] = iso_to_mmddyyyy(repo_data.get("pushed_at"))
        entry["language"] = repo_data.get("language") or ""
        license_data = repo_data.get("license") or {}
        entry["license"] = license_data.get("spdx_id") or license_data.get("name") or ""
        entry["homepage_url"] = repo_data.get("homepage") or ""
        # flags may have gained "custom-description" above; refresh the field.
        entry["flags"] = sorted(flags)

    invalid_tags = [tag for tag in entry["tags"] if tag not in ref.tag_ids]
    if invalid_tags:
        raise ValueError(f"Invalid tag ids supplied: {', '.join(invalid_tags)}")
    invalid_platforms = [platform for platform in entry["platforms"] if platform not in ref.platform_ids]
    if invalid_platforms:
        raise ValueError(f"Invalid platform ids supplied: {', '.join(invalid_platforms)}")
    if entry["category"] and entry["category"] not in ref.category_ids:
        raise ValueError(f"Invalid category id supplied: {entry['category']}")

    # BUGFIX: infer_platforms/infer_category already append notes that contain
    # the full id lists, so unconditionally appending these fallback reminders
    # printed the same lists twice. Only add a reminder when no existing note
    # mentions the field yet.
    if not entry["platforms"] and not any("platform" in note.lower() for note in notes):
        notes.append(
            "No platforms detected; please review `platforms`.\n"
            f"{available_text('platform', ref.platform_labels)}"
        )
    if not entry["category"] and not any("category" in note.lower() for note in notes):
        notes.append(
            "Category missing; update `category` manually.\n"
            f"{available_text('category', ref.category_labels)}"
        )
    return entry, notes
def main() -> None:
    """CLI entry point: fetch repo metadata, build an entry, and append it to the data file."""
    parser = argparse.ArgumentParser(
        description="Generate an applications.json entry from a GitHub repository URL."
    )
    parser.add_argument("repo", help="GitHub repository URL or owner/repo slug.")
    parser.add_argument(
        "--applications-file",
        default=str(APPLICATIONS_FILE),
        help="Path to applications.json (default: source/data/dynamic/applications.json).",
    )
    parser.add_argument(
        "--full-details",
        action="store_true",
        help="Populate optional fields (description, stats, license, homepage) using GitHub data.",
    )
    args = parser.parse_args()

    owner, repo_name = parse_repo_identifier(args.repo)
    canonical_url = f"https://github.com/{owner}/{repo_name}"
    token = os.getenv("GITHUB_TOKEN")

    metadata = fetch_repo(owner, repo_name, token)
    reference = load_reference_data()
    entry, notes = build_entry(canonical_url, metadata, reference, owner, repo_name, args.full_details, token)
    # Let the user fill any gaps interactively, then drop notes they resolved.
    entry = fill_missing_with_input(entry, reference)
    notes = filter_resolved_notes(notes, entry)

    target = Path(args.applications_file).resolve()
    print(json.dumps(entry, indent=4))
    if notes:
        print("\nNotes:")
        for reminder in notes:
            print(f"- {reminder}")

    try:
        append_application(entry, target)
    except DuplicateRepositoryError as exc:
        print(f"\nEntry skipped: {exc}")
        print("Hint: If you meant to update that entry, edit applications.json directly.")
        return
    print(f"\nAdded entry to {target}")


if __name__ == "__main__":
    main()

View File

@@ -11707,6 +11707,22 @@
"language": "Python",
"license": "NOASSERTION",
"homepage_url": "https://www.deluge-torrent.org"
},
{
"name": "Pdf.js",
"description": "",
"repo_url": "https://github.com/mozilla/pdf.js",
"tags": [],
"platforms": [
"cross"
],
"category": "window-management",
"stars": 0,
"flags": [],
"last_commit": "",
"language": "",
"license": "",
"homepage_url": ""
}
]
}
}