#!/usr/bin/env python3
"""Generate analysis tables and charts from llms.txt crawl output."""

from __future__ import annotations

import csv
import json
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt


# Resolve all project paths relative to this script's directory so the tool
# behaves the same regardless of the current working directory.
BASE = Path(__file__).resolve().parent
DATA = BASE / "data"  # crawl result CSVs in, enriched CSV + summary JSON out
CHARTS = BASE / "charts"  # rendered PNG charts (created on demand in main())
RAW = BASE / "raw_llms_txt" / "llms_txt"  # one fetched llms.txt body per domain


def safe_name(domain: str) -> str:
    """Return *domain* as a filesystem-safe name.

    Every run of characters outside ``[A-Za-z0-9_.-]`` collapses to a
    single underscore, matching the naming used by the crawler when it
    saved the raw llms.txt bodies.
    """
    unsafe_run = r"[^A-Za-z0-9_.-]+"
    return re.sub(unsafe_run, "_", domain)


def read_rows(limit: int) -> list[dict[str, str]]:
    """Load the crawl CSV for the top-*limit* sample as a list of dict rows."""
    csv_path = DATA / f"llms_probe_results_top_{limit}.csv"
    with csv_path.open(newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        return list(reader)


def is_200(value: str) -> bool:
    """Return True when *value* stringifies to exactly "200".

    The crawler sometimes stores statuses as ints, sometimes as strings;
    coercing through str() accepts both.
    """
    status_text = str(value)
    return status_text == "200"


def valid_hit(row: dict[str, str], prefix: str) -> bool:
    """Whether the probe named *prefix* both returned HTTP 200 and was
    flagged valid by the crawler.

    Missing ``<prefix>_valid`` columns count as invalid ("0").
    """
    if str(row[f"{prefix}_status"]) != "200":
        return False
    return str(row.get(f"{prefix}_valid", "0")) == "1"


def bucket(rank: int) -> str:
    """Map a Tranco *rank* to one of three coarse rank-bucket labels."""
    if rank > 1000:
        return "Top 1k-10k"
    if rank > 100:
        return "Top 101-1k"
    return "Top 100"


def content_metrics(domain: str) -> dict[str, int | str]:
    """Compute structural metrics for a domain's fetched llms.txt body.

    Reads the raw text saved by the crawler under RAW (named via
    safe_name) and counts headings, markdown links, and bare URLs.

    Returns a dict with int counts plus ``first_h1`` (the text of the
    first H1 heading, or "" when the file has none).
    """
    path = RAW / f"{safe_name(domain)}.txt"
    text = path.read_text(encoding="utf-8", errors="replace")
    # Run the first-H1 search once and reuse the match (the original
    # executed the identical regex twice: once for the value, once for
    # the existence check).
    first_h1 = re.search(r"(?m)^#\s+(.+)", text)
    return {
        "h1_count": len(re.findall(r"(?m)^#\s+", text)),
        "section_count": len(re.findall(r"(?m)^#{1,3}\s+", text)),
        "markdown_link_count": len(re.findall(r"\[[^\]]+\]\([^\)]+\)", text)),
        "url_count": len(re.findall(r"https?://", text)),
        # The spec's "## Optional" section marks deprioritizable links.
        "has_optional_section": int("## optional" in text.lower()),
        "first_h1": first_h1.group(1).strip() if first_h1 else "",
    }


def _size_bucket(nbytes: int) -> str:
    """Classify an llms.txt payload size (bytes) into a coarse bucket."""
    if nbytes == 0:
        return "empty"
    if nbytes < 500:
        return "thin_<500B"
    if nbytes < 5000:
        return "index_500B-5KB"
    return "substantial_5KB+"


def main() -> None:
    """Produce the enriched adopter CSV, the summary JSON, and three charts.

    Reads the crawl CSV with the numerically largest ``top_N`` suffix from
    DATA, computes adoption statistics overall and per Tranco rank bucket,
    and writes all artifacts under DATA and CHARTS.

    Raises SystemExit when no crawl CSV exists or the chosen CSV is empty.
    """
    CHARTS.mkdir(exist_ok=True)
    limit_files = list(DATA.glob("llms_probe_results_top_*.csv"))
    if not limit_files:
        raise SystemExit("No crawl result CSV found")
    # BUGFIX: choose the largest sample *numerically*. The previous
    # lexicographic sorted(...)[-1] ranked "top_500" after "top_10000"
    # and silently analyzed the smaller crawl.
    latest = max(limit_files, key=lambda p: int(p.stem.rsplit("_", 1)[-1]))
    limit = int(latest.stem.rsplit("_", 1)[-1])
    rows = read_rows(limit)
    if not rows:
        # Guard the percentage math below against ZeroDivisionError.
        raise SystemExit("Crawl result CSV is empty")

    http_200_hits = [r for r in rows if is_200(r["llms_txt_status"])]
    hits = [r for r in rows if valid_hit(r, "llms_txt")]
    full_hits = [r for r in rows if valid_hit(r, "llms_full_txt")]
    both = [r for r in rows if valid_hit(r, "llms_txt") and valid_hit(r, "llms_full_txt")]

    # Enrich each adopter row with structural metrics from its saved file;
    # metrics are stringified so DictWriter emits uniform CSV cells.
    enriched = []
    for row in hits:
        metrics = content_metrics(row["domain"])
        enriched.append({**row, **{k: str(v) for k, v in metrics.items()}})

    with (DATA / f"llms_txt_adopters_enriched_top_{limit}.csv").open("w", newline="", encoding="utf-8") as f:
        if enriched:
            writer = csv.DictWriter(f, fieldnames=list(enriched[0].keys()))
            writer.writeheader()
            writer.writerows(enriched)

    bucket_rows = {
        name: [r for r in rows if bucket(int(r["rank"])) == name]
        for name in ("Top 100", "Top 101-1k", "Top 1k-10k")
    }

    # Per-bucket stats, counting valid hits once per bucket (the original
    # evaluated the same sum(...) twice for each bucket).
    bucket_stats = {}
    for name, subset in bucket_rows.items():
        n_hits = sum(1 for r in subset if valid_hit(r, "llms_txt"))
        bucket_stats[name] = {
            "domains": len(subset),
            "llms_txt_hits": n_hits,
            "llms_txt_pct": round(n_hits / len(subset) * 100, 4) if subset else 0,
        }

    summary = {
        "sample_size": len(rows),
        "llms_txt_http_200": len(http_200_hits),
        "llms_txt_hits": len(hits),
        "llms_txt_adoption_pct": round(len(hits) / len(rows) * 100, 4),
        "llms_full_txt_hits": len(full_hits),
        "llms_full_txt_adoption_pct": round(len(full_hits) / len(rows) * 100, 4),
        "both_files_hits": len(both),
        "both_files_pct": round(len(both) / len(rows) * 100, 4),
        "rank_buckets": bucket_stats,
        "content_size_buckets": dict(Counter(
            _size_bucket(int(r["llms_txt_bytes"] or 0)) for r in hits
        )),
        "top_20_known_adopters": [
            {
                "rank": int(r["rank"]),
                "domain": r["domain"],
                "llms_txt_bytes": int(r["llms_txt_bytes"] or 0),
                "has_llms_full_txt": valid_hit(r, "llms_full_txt"),
            }
            for r in hits[:20]
        ],
    }
    (DATA / f"final_analysis_top_{limit}.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

    # Chart 1: absolute adopter counts.
    plt.figure(figsize=(7, 4))
    labels = ["llms.txt", "llms-full.txt", "both"]
    values = [len(hits), len(full_hits), len(both)]
    plt.bar(labels, values, color=["#2563eb", "#059669", "#7c3aed"])
    plt.title(f"llms.txt adoption in Tranco Top {limit:,}")
    # BUGFIX: these counts are *valid* hits (HTTP 200 AND valid flag), so
    # the old label "Domains returning HTTP 200" overstated the data —
    # the plain HTTP-200 count (http_200_hits) is a larger number.
    plt.ylabel("Domains with a valid file")
    for i, v in enumerate(values):
        plt.text(i, v + max(values + [1]) * 0.02, str(v), ha="center")
    plt.tight_layout()
    plt.savefig(CHARTS / f"adoption_counts_top_{limit}.png", dpi=180)
    plt.close()

    # Chart 2: adoption rate by rank bucket (reuses the per-bucket stats
    # instead of recomputing the sums).
    plt.figure(figsize=(7, 4))
    labels = list(bucket_rows.keys())
    values = [bucket_stats[name]["llms_txt_pct"] for name in labels]
    plt.bar(labels, values, color="#0f766e")
    plt.title("llms.txt adoption by Tranco rank bucket")
    plt.ylabel("Adoption rate (%)")
    for i, v in enumerate(values):
        plt.text(i, v + max(values + [0.1]) * 0.03, f"{v:.2f}%", ha="center")
    plt.tight_layout()
    plt.savefig(CHARTS / f"rank_bucket_adoption_top_{limit}.png", dpi=180)
    plt.close()

    # Chart 3: content-size distribution among adopters.
    size_counts = summary["content_size_buckets"]
    plt.figure(figsize=(7, 4))
    labels = list(size_counts.keys())
    values = list(size_counts.values())
    plt.bar(labels, values, color="#9333ea")
    plt.title("llms.txt content size among adopters")
    plt.ylabel("Domains")
    plt.xticks(rotation=15, ha="right")
    for i, v in enumerate(values):
        plt.text(i, v + max(values + [1]) * 0.02, str(v), ha="center")
    plt.tight_layout()
    plt.savefig(CHARTS / f"content_size_distribution_top_{limit}.png", dpi=180)
    plt.close()

    print(json.dumps(summary, indent=2))


# Script entry point: only run the pipeline when executed directly,
# not when imported for its helpers.
if __name__ == "__main__":
    main()
