#!/usr/bin/env python3
"""Generate deeper llms.txt adoption analysis from the crawl archive."""

from __future__ import annotations

import csv
import json
import math
import re
from collections import Counter
from pathlib import Path
from statistics import median

import matplotlib.pyplot as plt


BASE = Path(__file__).resolve().parent
DATA = BASE / "data"
CHARTS = BASE / "charts"
RAW_LLMS = BASE / "raw_llms_txt" / "llms_txt"
RAW_FULL = BASE / "raw_llms_txt" / "llms_full_txt"
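
# Expected layout (inferred from the constants above): the probe CSV lives in
# data/, derived CSV/JSON outputs are written back to data/, charts go to
# charts/, and raw fetched bodies sit under raw_llms_txt/ as
# <safe_name(domain)>.txt.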


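# Topic signals searched with IGNORECASE | MULTILINE against each body in
# content_metrics(); each yields a 0/1 flag.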
TOPIC_PATTERNS = {
    "docs_or_documentation": r"\bdocs?|documentation\b",
    "api": r"\bapi\b",
    "pricing": r"\bpricing|price|plans?\b",
    "changelog_or_releases": r"\bchangelog|release notes?|releases?\b",
    "support_or_help": r"\bsupport|help center|help\b",
    "blog_or_guides": r"\bblog|guides?|tutorials?\b",
    "security_or_privacy": r"\bsecurity|privacy|compliance|terms\b",
    "github": r"github\.com",
    "optional_section": r"(?mi)^##\s+optional\b",
}


def safe_name(domain: str) -> str:
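    """Map a domain to the filesystem-safe stem used for archived bodies."""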
    return re.sub(r"[^A-Za-z0-9_.-]+", "_", domain)


def read_rows() -> list[dict[str, str]]:
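    """Load the top-10k probe results CSV as a list of string dicts."""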
    with (DATA / "llms_probe_results_top_10000.csv").open(newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))


def read_text(raw_dir: Path, domain: str) -> str:
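    """Return the archived body for a domain, or "" if none was saved."""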
    path = raw_dir / f"{safe_name(domain)}.txt"
    if not path.exists():
        return ""
    return path.read_text(encoding="utf-8", errors="replace")


def valid_hit(row: dict[str, str], prefix: str) -> bool:
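    """True when the probe recorded HTTP 200 and marked the body valid."""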
    return row.get(f"{prefix}_status") == "200" and row.get(f"{prefix}_valid") == "1"


def content_metrics(text: str) -> dict[str, int | str]:
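    """Compute structural metrics for a raw llms.txt body.

    Counts headings, markdown links, and URLs, flags each TOPIC_PATTERNS
    signal, and captures the first H1 (truncated to 160 chars).
    """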
    headings = re.findall(r"(?m)^(#{1,6})\s+(.+?)\s*$", text)
    md_links = re.findall(r"\[[^\]]+\]\([^\)]+\)", text)
    urls = re.findall(r"https?://", text)
    topic_hits = {
        name: int(re.search(pattern, text, re.IGNORECASE | re.MULTILINE) is not None)
        for name, pattern in TOPIC_PATTERNS.items()
    }
    first_h1 = ""
    first_h1_match = re.search(r"(?m)^#\s+(.+?)\s*$", text)
    if first_h1_match:
        first_h1 = first_h1_match.group(1).strip()[:160]
    return {
        "chars": len(text),
        "lines": text.count("\n") + 1 if text else 0,
        "h1_count": sum(1 for h, _ in headings if h == "#"),
        "h2_count": sum(1 for h, _ in headings if h == "##"),
        "h3_count": sum(1 for h, _ in headings if h == "###"),
        "section_count": sum(1 for h, _ in headings if len(h) <= 3),
        "markdown_link_count": len(md_links),
        "url_count": len(urls),
        "first_h1": first_h1,
        **topic_hits,
    }


def quality_score(row: dict[str, str], metrics: dict[str, int | str]) -> tuple[int, str, list[str]]:
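    """Score one valid llms.txt on a heuristic 0-100 rubric.

    Points: 20 base, up to 10 for content type, 20 for size, 20 for structure,
    20 for markdown links, and 10 for topic coverage, with penalties for link
    dumps and extreme sizes. The thresholds are judgment calls, not a spec.
    Returns (score, tier label, issue tags).
    """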
    size_bytes = int(row.get("llms_txt_bytes") or 0)
    content_type = (row.get("llms_txt_content_type") or "").split(";")[0].lower()
    sections = int(metrics["section_count"])
    links = int(metrics["markdown_link_count"])
    h1_count = int(metrics["h1_count"])
    issues: list[str] = []
    score = 20

    if content_type in {"text/plain", "text/markdown"}:
        score += 10
    elif content_type:
        score += 4
        issues.append(f"unusual_content_type:{content_type}")
    else:
        issues.append("missing_content_type")

    if 500 <= size_bytes <= 200_000:
        score += 20
    elif 200_000 < size_bytes <= 1_000_000:
        score += 14
        issues.append("large_file")
    elif size_bytes > 1_000_000:
        score += 7
        issues.append("very_large_file")
    elif 1 <= size_bytes < 500:
        score += 5
        issues.append("thin_file")
    else:
        issues.append("empty_file")

    if sections >= 6:
        score += 20
    elif sections >= 2:
        score += 12
    elif sections == 1:
        score += 6
        issues.append("minimal_structure")
    else:
        issues.append("no_markdown_structure")

    if links >= 11:
        score += 20
    elif links >= 1:
        score += 12
    else:
        issues.append("no_markdown_links")

    topic_count = sum(int(metrics[name]) for name in TOPIC_PATTERNS)
    score += min(10, topic_count * 2)

    if h1_count == 0:
        issues.append("missing_h1")
    if links > 5_000:
        issues.append("link_dump")
        score -= 8
    if size_bytes > 2_000_000:
        score -= 5
    if size_bytes < 100:
        score -= 8

    score = max(0, min(100, score))
    if score >= 80:
        tier = "strong_structured_index"
    elif score >= 60:
        tier = "usable_index"
    elif score >= 35:
        tier = "thin_or_irregular"
    else:
        tier = "symbolic_or_low_utility"
    return score, tier, issues


def archetype(row: dict[str, str], metrics: dict[str, int | str], score: int) -> str:
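    """Bucket a valid llms.txt into a coarse implementation archetype."""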
    size_bytes = int(row.get("llms_txt_bytes") or 0)
    links = int(metrics["markdown_link_count"])
    sections = int(metrics["section_count"])
    if size_bytes < 500 or score < 35:
        return "symbolic_or_placeholder"
    if size_bytes > 1_000_000 or links > 5_000:
        return "massive_content_dump"
    if sections >= 6 and links >= 11:
        return "structured_index"
    if links >= 11:
        return "link_catalog"
    if sections >= 6:
        return "sectioned_text"
    return "thin_index"


def percentile(values: list[int], p: float) -> int:
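    """Nearest-rank percentile via a rounded index; adequate for summary stats."""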
    if not values:
        return 0
    values = sorted(values)
    idx = round((len(values) - 1) * p)
    return values[idx]


def save_csv(path: Path, rows: list[dict[str, object]], fieldnames: list[str]) -> None:
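    """Write dict rows to a UTF-8 CSV with the given column order."""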
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def bar_chart(path: Path, labels: list[str], values: list[float], title: str, ylabel: str, color: str) -> None:
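    """Draw a labeled vertical bar chart; values render as percentages when
    ylabel ends in "(%)", otherwise as integers."""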
    plt.figure(figsize=(9, 4.8))
    bars = plt.bar(labels, values, color=color)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xticks(rotation=25, ha="right")
    ymax = max(values + [1])
    for bar, value in zip(bars, values):
        label = f"{value:.1f}%" if ylabel.endswith("(%)") else f"{int(value):,}"
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + ymax * 0.025, label, ha="center", fontsize=9)
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def horizontal_bar(path: Path, labels: list[str], values: list[float], title: str, xlabel: str, color: str) -> None:
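    """Draw a labeled horizontal bar chart with the first label at the top."""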
    plt.figure(figsize=(8.6, 5.2))
    y = list(range(len(labels)))
    plt.barh(y, values, color=color)
    plt.yticks(y, labels)
    plt.gca().invert_yaxis()
    plt.title(title)
    plt.xlabel(xlabel)
    xmax = max(values + [1])
    for idx, value in enumerate(values):
        label = f"{value:.1f}%" if xlabel.endswith("(%)") else f"{int(value):,}"
        plt.text(value + xmax * 0.015, idx, label, va="center", fontsize=9)
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def main() -> None:
    rows = read_rows()
    valid = [r for r in rows if valid_hit(r, "llms_txt")]
    full_valid = [r for r in rows if valid_hit(r, "llms_full_txt")]
    http_200 = [r for r in rows if r["llms_txt_status"] == "200"]
    invalid_200 = [r for r in http_200 if not valid_hit(r, "llms_txt")]

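    # Enrich every domain with a valid llms.txt: content metrics, heuristic
    # quality score, archetype, and paired llms-full.txt stats where present.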
    enriched: list[dict[str, object]] = []
    for row in valid:
        text = read_text(RAW_LLMS, row["domain"])
        metrics = content_metrics(text)
        score, tier, issues = quality_score(row, metrics)
        arch = archetype(row, metrics, score)
        full_text = read_text(RAW_FULL, row["domain"]) if valid_hit(row, "llms_full_txt") else ""
        full_metrics = content_metrics(full_text) if full_text else {}
        full_bytes = int(row.get("llms_full_txt_bytes") or 0)
        llms_bytes = int(row.get("llms_txt_bytes") or 0)
        enriched.append(
            {
                "rank": int(row["rank"]),
                "domain": row["domain"],
                "tld": row["domain"].rsplit(".", 1)[-1],
                "llms_txt_url": row["llms_txt_url"],
                "llms_txt_bytes": llms_bytes,
                "llms_txt_content_type": row["llms_txt_content_type"],
                **metrics,
                "quality_score": score,
                "quality_tier": tier,
                "archetype": arch,
                "quality_issues": "|".join(issues),
                "llms_full_txt_valid": int(valid_hit(row, "llms_full_txt")),
                "llms_full_txt_bytes": full_bytes,
                "full_to_index_size_ratio": round(full_bytes / llms_bytes, 3) if llms_bytes else 0,
                "llms_full_markdown_link_count": full_metrics.get("markdown_link_count", 0),
                "llms_full_section_count": full_metrics.get("section_count", 0),
            }
        )

    fieldnames = [
        "rank",
        "domain",
        "tld",
        "llms_txt_url",
        "llms_txt_bytes",
        "llms_txt_content_type",
        "chars",
        "lines",
        "h1_count",
        "h2_count",
        "h3_count",
        "section_count",
        "markdown_link_count",
        "url_count",
        *TOPIC_PATTERNS.keys(),
        "first_h1",
        "quality_score",
        "quality_tier",
        "archetype",
        "quality_issues",
        "llms_full_txt_valid",
        "llms_full_txt_bytes",
        "full_to_index_size_ratio",
        "llms_full_markdown_link_count",
        "llms_full_section_count",
    ]
    save_csv(DATA / "deep_llms_txt_adopters_top_10000.csv", enriched, fieldnames)

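    # Adoption per 1,000-rank bucket across the top 10,000.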
    rank_deciles = []
    for lo in range(1, 10001, 1000):
        hi = lo + 999
        subset = [r for r in rows if lo <= int(r["rank"]) <= hi]
        hits = sum(valid_hit(r, "llms_txt") for r in subset)
        full_hits = sum(valid_hit(r, "llms_full_txt") for r in subset)
        rank_deciles.append(
            {
                "rank_bucket": f"{lo:,}-{hi:,}",
                "domains": len(subset),
                "llms_txt_hits": hits,
                "llms_txt_adoption_pct": round(hits / len(subset) * 100, 2),
                "llms_full_txt_hits": full_hits,
                "llms_full_txt_adoption_pct": round(full_hits / len(subset) * 100, 2),
            }
        )
    save_csv(
        DATA / "deep_rank_decile_adoption_top_10000.csv",
        rank_deciles,
        ["rank_bucket", "domains", "llms_txt_hits", "llms_txt_adoption_pct", "llms_full_txt_hits", "llms_full_txt_adoption_pct"],
    )

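    # Adoption by TLD, limited to TLDs with at least 50 sampled domains.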
    all_tld = Counter(r["domain"].rsplit(".", 1)[-1] for r in rows)
    valid_tld = Counter(r["domain"].rsplit(".", 1)[-1] for r in valid)
    full_tld = Counter(r["domain"].rsplit(".", 1)[-1] for r in full_valid)
    tld_rows = []
    for tld, count in all_tld.items():
        if count >= 50:
            tld_rows.append(
                {
                    "tld": f".{tld}",
                    "domains": count,
                    "llms_txt_hits": valid_tld[tld],
                    "llms_txt_adoption_pct": round(valid_tld[tld] / count * 100, 2),
                    "llms_full_txt_hits": full_tld[tld],
                    "llms_full_txt_adoption_pct": round(full_tld[tld] / count * 100, 2),
                }
            )
    tld_rows.sort(key=lambda x: (float(x["llms_txt_adoption_pct"]), int(x["llms_txt_hits"])), reverse=True)
    save_csv(
        DATA / "deep_tld_adoption_top_10000.csv",
        tld_rows,
        ["tld", "domains", "llms_txt_hits", "llms_txt_adoption_pct", "llms_full_txt_hits", "llms_full_txt_adoption_pct"],
    )

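    # Prevalence of each topic signal across valid files.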
    topic_rows = []
    for name in TOPIC_PATTERNS:
        hits = sum(int(e[name]) for e in enriched)
        topic_rows.append({"topic_signal": name, "hits": hits, "share_of_valid_pct": round(hits / len(enriched) * 100, 2)})
    topic_rows.sort(key=lambda x: int(x["hits"]), reverse=True)
    save_csv(DATA / "deep_topic_signals_top_10000.csv", topic_rows, ["topic_signal", "hits", "share_of_valid_pct"])

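    # Distribution summaries over the valid population.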
    size_values = [int(e["llms_txt_bytes"]) for e in enriched]
    score_values = [int(e["quality_score"]) for e in enriched]
    tier_counts = Counter(str(e["quality_tier"]) for e in enriched)
    archetype_counts = Counter(str(e["archetype"]) for e in enriched)
    issue_counts: Counter[str] = Counter()
    for e in enriched:
        issue_counts.update(i for i in str(e["quality_issues"]).split("|") if i)

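    # Domains publishing both llms.txt and llms-full.txt.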
    dual = [e for e in enriched if int(e["llms_full_txt_valid"]) == 1]
    dual_summary = {
        "dual_file_valid_domains": len(dual),
        "dual_file_share_of_valid_llms_txt_pct": round(len(dual) / len(enriched) * 100, 2),
        "same_or_tiny_full_files": sum(
            1
            for e in dual
            if int(e["llms_full_txt_bytes"]) <= int(e["llms_txt_bytes"]) or int(e["llms_full_txt_bytes"]) < 100
        ),
        "full_larger_than_index": sum(1 for e in dual if int(e["llms_full_txt_bytes"]) > int(e["llms_txt_bytes"])),
        "median_full_to_index_size_ratio": round(median([float(e["full_to_index_size_ratio"]) for e in dual]), 2) if dual else 0,
        "largest_full_files": sorted(
            [
                {
                    "rank": e["rank"],
                    "domain": e["domain"],
                    "llms_txt_bytes": e["llms_txt_bytes"],
                    "llms_full_txt_bytes": e["llms_full_txt_bytes"],
                    "full_to_index_size_ratio": e["full_to_index_size_ratio"],
                }
                for e in dual
            ],
            key=lambda x: int(x["llms_full_txt_bytes"]),
            reverse=True,
        )[:15],
    }

    summary = {
        "sample_size": len(rows),
        "valid_llms_txt": len(valid),
        "valid_llms_full_txt": len(full_valid),
        "http_200_llms_txt": len(http_200),
        "invalid_http_200_llms_txt": len(invalid_200),
        "invalid_http_200_share_pct": round(len(invalid_200) / len(http_200) * 100, 2),
        "raw_200_overestimate_factor": round(len(http_200) / len(valid), 2),
        "rank_deciles": rank_deciles,
        "top_tlds_by_adoption_min_50_domains": tld_rows[:15],
        "topic_signals": topic_rows,
        "quality_tiers": dict(tier_counts.most_common()),
        "archetypes": dict(archetype_counts.most_common()),
        "quality_issues": dict(issue_counts.most_common()),
        "quality_score_summary": {
            "median": round(median(score_values), 1),
            "p10": percentile(score_values, 0.10),
            "p25": percentile(score_values, 0.25),
            "p75": percentile(score_values, 0.75),
            "p90": percentile(score_values, 0.90),
        },
        "size_summary_bytes": {
            "median": round(median(size_values), 1),
            "p75": percentile(size_values, 0.75),
            "p90": percentile(size_values, 0.90),
            "p95": percentile(size_values, 0.95),
            "p99": percentile(size_values, 0.99),
            "max": max(size_values),
        },
        "largest_llms_txt_files": sorted(
            [
                {
                    "rank": e["rank"],
                    "domain": e["domain"],
                    "llms_txt_bytes": e["llms_txt_bytes"],
                    "markdown_link_count": e["markdown_link_count"],
                    "section_count": e["section_count"],
                    "quality_score": e["quality_score"],
                    "archetype": e["archetype"],
                }
                for e in enriched
            ],
            key=lambda x: int(x["llms_txt_bytes"]),
            reverse=True,
        )[:20],
        "top_quality_examples": sorted(
            [
                {
                    "rank": e["rank"],
                    "domain": e["domain"],
                    "llms_txt_bytes": e["llms_txt_bytes"],
                    "markdown_link_count": e["markdown_link_count"],
                    "section_count": e["section_count"],
                    "quality_score": e["quality_score"],
                    "archetype": e["archetype"],
                }
                for e in enriched
            ],
            key=lambda x: (int(x["quality_score"]), -int(x["rank"])),
            reverse=True,
        )[:25],
        "lowest_utility_top_ranked_examples": sorted(
            [
                {
                    "rank": e["rank"],
                    "domain": e["domain"],
                    "llms_txt_bytes": e["llms_txt_bytes"],
                    "markdown_link_count": e["markdown_link_count"],
                    "section_count": e["section_count"],
                    "quality_score": e["quality_score"],
                    "quality_issues": e["quality_issues"],
                }
                for e in enriched
                if int(e["quality_score"]) < 45
            ],
            key=lambda x: int(x["rank"]),
        )[:25],
        "dual_file_summary": dual_summary,
    }
    (DATA / "deep_analysis_top_10000.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

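    # Charts 05-08 (numbering presumably continues a base analysis script).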
    bar_chart(
        CHARTS / "05_rank_decile_adoption.png",
        [r["rank_bucket"] for r in rank_deciles],
        [float(r["llms_txt_adoption_pct"]) for r in rank_deciles],
        "llms.txt adoption by Tranco rank decile",
        "Adoption rate (%)",
        "#2563eb",
    )
    horizontal_bar(
        CHARTS / "06_tld_adoption_min_50.png",
        [r["tld"] for r in tld_rows[:12]],
        [float(r["llms_txt_adoption_pct"]) for r in tld_rows[:12]],
        "Top TLDs by llms.txt adoption (min. 50 domains)",
        "Adoption rate (%)",
        "#0f766e",
    )
    horizontal_bar(
        CHARTS / "07_topic_signals.png",
        [r["topic_signal"].replace("_", " ") for r in topic_rows],
        [float(r["share_of_valid_pct"]) for r in topic_rows],
        "Common content signals in valid llms.txt files",
        "Share of valid files (%)",
        "#7c3aed",
    )
    horizontal_bar(
        CHARTS / "08_quality_archetypes.png",
        [k.replace("_", " ") for k, _ in archetype_counts.most_common()],
        [v for _, v in archetype_counts.most_common()],
        "Implementation archetypes among valid llms.txt files",
        "Domains",
        "#c2410c",
    )

    print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
