mirror of
https://github.com/altstackHQ/altstack-data.git
synced 2026-04-17 18:53:11 +02:00
feat: add scraper.py for nightly GitHub data enrichment
This commit is contained in:
128
scraper/scraper.py
Normal file
128
scraper/scraper.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AltStack Data Scraper — Nightly GitHub Enrichment
|
||||
|
||||
Reads data/tools.json, updates live GitHub metadata for every tool
|
||||
that has a `github_repo` field, and writes the enriched file back.
|
||||
|
||||
Enriched fields:
|
||||
- stars (stargazers_count)
|
||||
- last_commit (pushed_at)
|
||||
- language (language)
|
||||
- license (license.spdx_id)
|
||||
- description (description, only if repo has one)
|
||||
|
||||
Usage:
|
||||
GITHUB_TOKEN=ghp_xxx python scraper/scraper.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
DATA_FILE = os.path.join(os.path.dirname(__file__), "..", "data", "tools.json")
|
||||
API_BASE = "https://api.github.com"
|
||||
|
||||
|
||||
def get_headers():
|
||||
token = os.environ.get("GITHUB_TOKEN", "")
|
||||
headers = {"Accept": "application/vnd.github+json"}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
return headers
|
||||
|
||||
|
||||
def fetch_repo(owner_repo: str):
|
||||
"""Fetch repository metadata from the GitHub API."""
|
||||
url = f"{API_BASE}/repos/{owner_repo}"
|
||||
try:
|
||||
resp = requests.get(url, headers=get_headers(), timeout=15)
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
print(f" ⚠ {owner_repo}: HTTP {resp.status_code}", file=sys.stderr)
|
||||
except requests.RequestException as exc:
|
||||
print(f" ⚠ {owner_repo}: {exc}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def enrich_tool(tool):
|
||||
"""Enrich a single tool dict in-place. Returns True if anything changed."""
|
||||
repo_slug = tool.get("github_repo")
|
||||
if not repo_slug:
|
||||
return False
|
||||
|
||||
data = fetch_repo(repo_slug)
|
||||
if data is None:
|
||||
return False
|
||||
|
||||
changed = False
|
||||
|
||||
# Stars
|
||||
new_stars = data.get("stargazers_count")
|
||||
if new_stars is not None and new_stars != tool.get("stars"):
|
||||
tool["stars"] = new_stars
|
||||
changed = True
|
||||
|
||||
# Last commit (pushed_at)
|
||||
pushed = data.get("pushed_at")
|
||||
if pushed and pushed != tool.get("last_commit"):
|
||||
tool["last_commit"] = pushed
|
||||
changed = True
|
||||
|
||||
# Language
|
||||
lang = data.get("language")
|
||||
if lang and lang != tool.get("language"):
|
||||
tool["language"] = lang
|
||||
changed = True
|
||||
|
||||
# License
|
||||
lic = data.get("license")
|
||||
if lic and isinstance(lic, dict):
|
||||
spdx = lic.get("spdx_id") or lic.get("name")
|
||||
if spdx and spdx != "NOASSERTION" and spdx != tool.get("license"):
|
||||
tool["license"] = spdx
|
||||
changed = True
|
||||
|
||||
# Description — only update if repo has one and tool doesn't already
|
||||
desc = data.get("description")
|
||||
if desc and not tool.get("description"):
|
||||
tool["description"] = desc
|
||||
changed = True
|
||||
|
||||
return changed
|
||||
|
||||
|
||||
def main():
|
||||
# Load
|
||||
with open(DATA_FILE, "r", encoding="utf-8") as f:
|
||||
tools = json.load(f)
|
||||
|
||||
print(f"Loaded {len(tools)} tools from {DATA_FILE}")
|
||||
|
||||
updated = 0
|
||||
for tool in tools:
|
||||
repo = tool.get("github_repo")
|
||||
if not repo:
|
||||
continue
|
||||
print(f" -> {repo}", end="")
|
||||
if enrich_tool(tool):
|
||||
print(" updated")
|
||||
updated += 1
|
||||
else:
|
||||
print(" — no changes")
|
||||
# Respect rate limits
|
||||
time.sleep(0.5)
|
||||
|
||||
# Write back
|
||||
with open(DATA_FILE, "w", encoding="utf-8") as f:
|
||||
json.dump(tools, f, indent=2, ensure_ascii=False)
|
||||
f.write("\n")
|
||||
|
||||
print(f"\nDone. {updated}/{len(tools)} tools enriched.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user