"""Generate an ``applications.json`` entry from a GitHub repository.

The script fetches repository metadata from the GitHub REST API, builds an
entry dictionary, optionally prompts for platform/tag/category ids that could
not be determined, prints the result, and appends it to ``applications.json``
unless an entry with the same repository URL already exists.
"""

import argparse
import base64
import json
import os
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Set, Tuple

try:
    import requests
except ImportError:  # Only the network helpers need it; see github_request().
    requests = None  # type: ignore[assignment]

_SCRIPT_PATH = Path(__file__).resolve()
# ROOT is the third ancestor of this file.  Fall back to the script's own
# directory instead of raising IndexError at import time when the file has
# been placed somewhere too shallow for that ancestor to exist.
ROOT = (
    _SCRIPT_PATH.parents[2]
    if len(_SCRIPT_PATH.parents) > 2
    else _SCRIPT_PATH.parent
)
TAGS_FILE = ROOT / "source" / "data" / "static" / "tags.json"
PLATFORMS_FILE = ROOT / "source" / "data" / "static" / "platforms.json"
CATEGORIES_FILE = ROOT / "source" / "data" / "static" / "categories.json"
APPLICATIONS_FILE = ROOT / "source" / "data" / "dynamic" / "applications.json"

USER_AGENT = "definitive-opensource-contributing-autofill"
GITHUB_API_VERSION = "2022-11-28"
# Upper bound (seconds) for a single GitHub API call so the CLI cannot hang
# indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# The named groups ``owner`` and ``repo`` are read by parse_repo_identifier().
REPO_PATTERN = re.compile(
    r"(?:github\.com[:/])?(?P<owner>[\w\-.]+)/(?P<repo>[\w\-.]+?)(?:\.git)?(?:[#?].*)?$",
    re.IGNORECASE,
)

PLATFORM_KEYWORDS: Dict[str, str] = {}
TAG_KEYWORDS: Dict[str, str] = {}
CATEGORY_KEYWORDS: Dict[str, str] = {}


@dataclass
class ReferenceData:
    """Valid ids and display labels loaded from the static reference files."""

    tag_ids: Set[str]
    platform_ids: Set[str]
    category_ids: Set[str]
    tag_labels: Dict[str, str]
    platform_labels: Dict[str, str]
    category_labels: Dict[str, str]


class DuplicateRepositoryError(RuntimeError):
    """Raised when attempting to append an application that already exists."""


def load_reference_data() -> ReferenceData:
    """Read tags, platforms and categories from their JSON files.

    Returns:
        A ``ReferenceData`` holding, for each kind, the set of known ids and
        a mapping from id to a human-readable label.
    """
    with open(TAGS_FILE, "r", encoding="utf-8") as fh:
        tags_data = json.load(fh)

    tag_labels: Dict[str, str] = {}
    # Attributes prefer their description and are prefixed with their emoji.
    for entry in tags_data.get("attributes", []):
        label = entry.get("description") or entry.get("name") or entry["id"]
        emoji = entry.get("emoji")
        if emoji:
            label = f"{emoji} {label}"
        tag_labels[entry["id"]] = label
    # Properties prefer their name over their description.
    for entry in tags_data.get("properties", []):
        label = entry.get("name") or entry.get("description") or entry["id"]
        tag_labels[entry["id"]] = label
    tag_ids = set(tag_labels.keys())

    with open(PLATFORMS_FILE, "r", encoding="utf-8") as fh:
        platforms_data = json.load(fh)
    platform_labels = {
        entry["id"]: entry.get("name") or entry["id"]
        for entry in platforms_data.get("platforms", [])
    }
    platform_ids = set(platform_labels.keys())

    with open(CATEGORIES_FILE, "r", encoding="utf-8") as fh:
        categories_data = json.load(fh)
    category_labels: Dict[str, str] = {
        entry["id"]: entry.get("name") or entry["id"]
        for entry in categories_data.get("categories", [])
    }
    # Subcategories are labelled "Name (Parent label)" when a parent is set.
    for entry in categories_data.get("subcategories", []):
        parent = entry.get("parent")
        parent_label = category_labels.get(parent, parent) if parent else None
        name = entry.get("name") or entry["id"]
        label = f"{name} ({parent_label})" if parent_label else name
        category_labels[entry["id"]] = label
    category_ids = set(category_labels.keys())

    return ReferenceData(
        tag_ids=tag_ids,
        platform_ids=platform_ids,
        category_ids=category_ids,
        tag_labels=tag_labels,
        platform_labels=platform_labels,
        category_labels=category_labels,
    )


def render_options(options: Dict[str, str], indent: str = " ") -> str:
    """Render ``options`` as a bullet list sorted by id.

    Each line is ``"<indent>- <id>: <label>"``; the label is omitted when it
    is empty or identical to the id.
    """
    lines = []
    for key, label in sorted(options.items()):
        descriptor = f"{key}: {label}" if label and label != key else key
        lines.append(f"{indent}- {descriptor}")
    return "\n".join(lines)


def available_text(label: str, options: Dict[str, str]) -> str:
    """Return an "Available <label> ids" listing, or ``""`` without options."""
    if not options:
        return ""
    return f"Available {label} ids:\n{render_options(options)}"


def load_applications_data(path: Path) -> Dict:
    """Load and return the JSON document stored at ``path``."""
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


def persist_applications_data(path: Path, data: Dict) -> None:
    """Write ``data`` to ``path`` as 4-space indented JSON plus a newline."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=4)
        fh.write("\n")


def _normalize_repo_url(url: Optional[str]) -> str:
    """Return ``url`` without trailing slashes, case-folded for comparison."""
    # GitHub owner and repository names are case-insensitive, so two URLs
    # that differ only in letter case point at the same repository.
    return (url or "").rstrip("/").casefold()


def append_application(entry: Dict, path: Path) -> None:
    """Append ``entry`` to the ``applications`` list stored at ``path``.

    Raises:
        DuplicateRepositoryError: If an existing application has the same
            ``repo_url`` (ignoring trailing slashes and letter case).
    """
    data = load_applications_data(path)
    applications = data.setdefault("applications", [])

    new_url = _normalize_repo_url(entry.get("repo_url"))
    for existing in applications:
        if _normalize_repo_url(existing.get("repo_url")) == new_url:
            # Show the path relative to ROOT when possible for readability.
            try:
                display_path = path.relative_to(ROOT)
            except ValueError:
                display_path = path
            raise DuplicateRepositoryError(
                f"Repository {entry['repo_url']} already exists in {display_path}."
            )

    applications.append(entry)
    persist_applications_data(path, data)


def parse_repo_identifier(value: str) -> Tuple[str, str]:
    """Extract ``(owner, repo)`` from a GitHub URL or ``owner/repo`` slug.

    Surrounding whitespace and trailing slashes are ignored.

    Raises:
        ValueError: If no ``owner/repo`` pair can be found in ``value``.
    """
    # REPO_PATTERN is anchored at the end of the string, so a trailing slash
    # would otherwise defeat it and fall through to the naive split below.
    value = value.strip().rstrip("/")
    match = REPO_PATTERN.search(value)
    if match:
        return match.group("owner"), match.group("repo")
    if "/" in value:
        owner, repo = value.split("/", 1)
        return owner, repo
    raise ValueError(f"Could not parse repository from '{value}'.")


def github_request(
    path: str,
    token: Optional[str],
    params: Optional[Dict[str, str]] = None,
) -> "requests.Response":
    """Send a GET request to the GitHub REST API.

    Args:
        path: API path relative to ``https://api.github.com/``.
        token: Optional bearer token sent in the ``Authorization`` header.
        params: Optional query-string parameters.

    Returns:
        The response object; status codes other than 401 and rate-limit 403
        are left for the caller to inspect.

    Raises:
        RuntimeError: If ``requests`` is not installed, authentication fails
            (401), or the rate limit is exceeded (403 mentioning it).
    """
    if requests is None:
        raise RuntimeError(
            "The 'requests' package is required to query the GitHub API."
        )

    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": USER_AGENT,
        "X-GitHub-Api-Version": GITHUB_API_VERSION,
    }
    if token:
        headers["Authorization"] = f"Bearer {token}"

    url = f"https://api.github.com/{path.lstrip('/')}"
    response = requests.get(
        url,
        headers=headers,
        params=params or {},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    if response.status_code == 401:
        raise RuntimeError(
            "GitHub authentication failed. Set the GITHUB_TOKEN environment variable."
        )
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise RuntimeError("GitHub rate limit exceeded. Provide a token to continue.")
    return response


def fetch_repo(owner: str, repo: str, token: Optional[str]) -> Dict:
    """Fetch repository metadata for ``owner/repo``.

    Raises:
        RuntimeError: If the API does not answer with status 200.
    """
    response = github_request(f"repos/{owner}/{repo}", token, params={"per_page": 1})
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch repo metadata ({response.status_code}): {response.text}"
        )
    return response.json()


def fetch_readme_excerpt(owner: str, repo: str, token: Optional[str]) -> Optional[str]:
    """Return the first non-empty README line, or ``None`` if unavailable.

    Leading ``#`` heading markers and trailing periods are removed from the
    returned line.
    """
    response = github_request(f"repos/{owner}/{repo}/readme", token)
    if response.status_code != 200:
        return None

    payload = response.json()
    content = payload.get("content")
    if not content:
        return None

    try:
        decoded = base64.b64decode(content).decode("utf-8", errors="replace")
    except (ValueError, UnicodeDecodeError):
        return None

    for line in decoded.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            stripped = stripped.lstrip("#").strip()
        stripped = stripped.rstrip(".")
        if stripped:
            return stripped
    return None


def normalize_project_name(repo_name: str) -> str:
    """Turn a repository name into a display name.

    Names that already contain an uppercase letter are returned unchanged;
    otherwise the name is split on ``-``/``_`` and each part is capitalized.
    """
    if not repo_name:
        return ""
    if any(ch.isupper() for ch in repo_name if ch.isalpha()):
        return repo_name
    tokens = [token for token in re.split(r"[-_]", repo_name) if token]
    if tokens:
        return " ".join(token.capitalize() for token in tokens)
    return repo_name.capitalize()


def iso_to_mmddyyyy(value: Optional[str]) -> str:
    """Convert ``YYYY-MM-DDTHH:MM:SSZ`` to ``MM/DD/YYYY``; ``""`` on failure."""
    if not value:
        return ""
    try:
        return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").strftime("%m/%d/%Y")
    except ValueError:
        return ""


def keyword_hits(keywords: Dict[str, str], sources: Sequence[str]) -> Set[str]:
    """Return the mapped values of every keyword found in ``sources``.

    Falsy sources are skipped, the remaining text is lowercased, and each
    keyword is matched as a whole word.
    """
    if not sources:
        return set()
    haystack = " ".join(filter(None, sources)).lower()
    matches: Set[str] = set()
    for needle, mapped in keywords.items():
        if re.search(rf"\b{re.escape(needle)}\b", haystack):
            matches.add(mapped)
    return matches


def infer_platforms(repo: Dict, ref: ReferenceData) -> Tuple[Set[str], List[str]]:
    """Return no platforms plus a note listing the available platform ids."""
    notes: List[str] = []
    notes.append(
        "Platforms were not inferred automatically.\n"
        f"{available_text('platform', ref.platform_labels)}"
    )
    return set(), notes


def infer_tags(repo: Dict, ref: ReferenceData) -> Tuple[Set[str], List[str]]:
    """Return no tags and no notes; tags are never inferred."""
    return set(), []


def infer_category(repo: Dict, ref: ReferenceData) -> Tuple[str, List[str]]:
    """Return an empty category plus a note listing the category ids."""
    return "", [
        "Category must be specified manually.\n"
        f"{available_text('category', ref.category_labels)}"
    ]


def prompt_list(
    field_label: str,
    options: Dict[str, str],
    allow_empty: bool = False,
) -> List[str]:
    """Prompt on stdin for a comma-separated list of ids from ``options``.

    The prompt repeats until every value is valid.  An empty answer returns
    ``[]`` when ``allow_empty`` is true and is rejected otherwise.
    """
    valid_values = set(options.keys())
    print(f"\n{available_text(field_label, options)}")
    skip_hint = " (press Enter to skip)" if allow_empty else ""
    prompt = f"Enter {field_label} ids (comma separated){skip_hint}:\n> "
    while True:
        raw = input(prompt).strip()
        if not raw and allow_empty:
            return []
        values = [val.strip() for val in raw.split(",") if val.strip()]
        invalid = [val for val in values if val not in valid_values]
        if invalid:
            print(f"Invalid values: {', '.join(invalid)}. Please try again.")
            continue
        if not values:
            print("At least one value is required. Press Ctrl+C to abort.")
            continue
        return values


def prompt_value(field_label: str, options: Dict[str, str]) -> str:
    """Prompt on stdin until a single valid id from ``options`` is entered."""
    valid_values = set(options.keys())
    print(f"\n{available_text(field_label, options)}")
    prompt = f"Enter {field_label} id:\n> "
    while True:
        raw = input(prompt).strip()
        if raw in valid_values:
            return raw
        print(f"{raw} is not a valid value. Please try again.")


def fill_missing_with_input(entry: Dict, ref: ReferenceData) -> Dict:
    """Interactively fill empty ``platforms``, ``tags`` and ``category``.

    Returns ``entry`` untouched when stdin is not a terminal; otherwise a
    shallow copy with the prompted values filled in.
    """
    if not sys.stdin.isatty():
        return entry

    updated = entry.copy()
    if not updated.get("platforms"):
        updated["platforms"] = prompt_list("platform", ref.platform_labels)
    if not updated.get("tags"):
        updated["tags"] = prompt_list("tag", ref.tag_labels, allow_empty=True)
    if not updated.get("category"):
        updated["category"] = prompt_value("category", ref.category_labels)
    return updated


def filter_resolved_notes(notes: List[str], entry: Dict) -> List[str]:
    """Drop notes about fields that ``entry`` now has a value for.

    A note is matched to a field by a case-insensitive substring test for
    "platform", "tag" or "category".
    """
    filtered: List[str] = []
    for note in notes:
        lowered = note.lower()
        if "platform" in lowered and entry.get("platforms"):
            continue
        if "tag" in lowered and entry.get("tags"):
            continue
        if "category" in lowered and entry.get("category"):
            continue
        filtered.append(note)
    return filtered


def build_entry(
    repo_url: str,
    repo_data: Dict,
    ref: ReferenceData,
    owner: str,
    repo: str,
    full_details: bool,
    token: Optional[str],
) -> Tuple[Dict, List[str]]:
    """Build an application entry and review notes from GitHub metadata.

    Args:
        repo_url: URL stored in the entry's ``repo_url`` field.
        repo_data: Repository metadata as returned by ``fetch_repo``.
        ref: Reference ids/labels used for validation and notes.
        owner: Repository owner, used when fetching the README.
        repo: Repository name, used when fetching the README.
        full_details: When true, also populate description, stars, last
            commit date, language, license and homepage.
        token: Optional GitHub token for the README request.

    Returns:
        ``(entry, notes)`` where ``notes`` lists items needing manual review.

    Raises:
        ValueError: If the entry contains a tag, platform or category id
            that is not present in ``ref``.
    """
    notes: List[str] = []
    flags: Set[str] = set()

    repo_description = repo_data.get("description") or ""
    readme_description: Optional[str] = None
    # The README is consulted only when the repository has no description.
    if full_details and not repo_description:
        readme_description = fetch_readme_excerpt(owner, repo, token)

    name = normalize_project_name(repo_data.get("name", ""))

    platforms, platform_notes = infer_platforms(repo_data, ref)
    notes.extend(platform_notes)
    tags, tag_notes = infer_tags(repo_data, ref)
    notes.extend(tag_notes)
    category, category_notes = infer_category(repo_data, ref)
    notes.extend(category_notes)

    entry = {
        "name": name,
        "description": "",
        "repo_url": repo_url,
        "tags": sorted(tags),
        "platforms": sorted(platforms),
        "category": category,
        "stars": 0,
        "flags": sorted(flags),
        "last_commit": "",
        "language": "",
        "license": "",
        "homepage_url": "",
    }

    if full_details:
        description_value = repo_description or readme_description or ""
        entry["description"] = description_value
        if readme_description and not repo_description:
            flags.add("custom-description")
            notes.append("Description pulled from README (custom-description flag added).")
        elif not description_value:
            notes.append("Repository has no description; field left blank.")

        entry["stars"] = repo_data.get("stargazers_count", 0)
        entry["last_commit"] = iso_to_mmddyyyy(repo_data.get("pushed_at"))
        entry["language"] = repo_data.get("language") or ""
        license_data = repo_data.get("license") or {}
        entry["license"] = license_data.get("spdx_id") or license_data.get("name") or ""
        entry["homepage_url"] = repo_data.get("homepage") or ""
        # Refresh the flags: they may have changed since the entry was built.
        entry["flags"] = sorted(flags)

    invalid_tags = [tag for tag in entry["tags"] if tag not in ref.tag_ids]
    if invalid_tags:
        raise ValueError(f"Invalid tag ids supplied: {', '.join(invalid_tags)}")

    invalid_platforms = [
        platform for platform in entry["platforms"] if platform not in ref.platform_ids
    ]
    if invalid_platforms:
        raise ValueError(f"Invalid platform ids supplied: {', '.join(invalid_platforms)}")

    if entry["category"] and entry["category"] not in ref.category_ids:
        raise ValueError(f"Invalid category id supplied: {entry['category']}")

    if not entry["platforms"]:
        notes.append(
            "No platforms detected; please review `platforms`.\n"
            f"{available_text('platform', ref.platform_labels)}"
        )
    if not entry["category"]:
        notes.append(
            "Category missing; update `category` manually.\n"
            f"{available_text('category', ref.category_labels)}"
        )

    return entry, notes


def main() -> None:
    """Parse arguments, build the entry, print it, and append it to the file."""
    parser = argparse.ArgumentParser(
        description="Generate an applications.json entry from a GitHub repository URL."
    )
    parser.add_argument("repo", help="GitHub repository URL or owner/repo slug.")
    parser.add_argument(
        "--applications-file",
        default=str(APPLICATIONS_FILE),
        help="Path to applications.json (default: source/data/dynamic/applications.json).",
    )
    parser.add_argument(
        "--full-details",
        action="store_true",
        help="Populate optional fields (description, stats, license, homepage) using GitHub data.",
    )
    args = parser.parse_args()

    owner, repo_name = parse_repo_identifier(args.repo)
    repo_url = f"https://github.com/{owner}/{repo_name}"
    token = os.getenv("GITHUB_TOKEN")

    repo_data = fetch_repo(owner, repo_name, token)
    ref = load_reference_data()
    entry, notes = build_entry(
        repo_url, repo_data, ref, owner, repo_name, args.full_details, token
    )
    entry = fill_missing_with_input(entry, ref)
    # Notes about fields the user just filled in are no longer relevant.
    notes = filter_resolved_notes(notes, entry)

    applications_path = Path(args.applications_file).resolve()

    print(json.dumps(entry, indent=4))
    if notes:
        print("\nNotes:")
        for note in notes:
            print(f"- {note}")

    try:
        append_application(entry, applications_path)
    except DuplicateRepositoryError as exc:
        print(f"\nEntry skipped: {exc}")
        print("Hint: If you meant to update that entry, edit applications.json directly.")
        return

    print(f"\nAdded entry to {applications_path}")


if __name__ == "__main__":
    main()