#!/usr/bin/env python3
"""Detailed early-adopter and implementation-quality analysis."""

from __future__ import annotations

import csv
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from statistics import median

import matplotlib.pyplot as plt


# Project-relative locations: everything lives beside this script.
BASE = Path(__file__).resolve().parent
DATA = BASE / "data"  # CSV/JSON outputs produced by main()
CHARTS = BASE / "charts"  # PNG chart outputs
RAW = BASE / "raw_llms_txt" / "llms_txt"  # cached llms.txt bodies, one file per domain


# Category key -> human-readable label used in CSV columns and chart axes.
# Keys are the canonical category identifiers returned by classify().
CATEGORY_LABELS = {
    "cloud_dev_infra": "Cloud, dev & infrastructure",
    "security_identity": "Security, identity & privacy",
    "saas_productivity": "SaaS, productivity & customer ops",
    "commerce_payments": "Commerce, payments & retail",
    "marketing_media_adtech": "Marketing, media & adtech",
    "cms_hosting_web": "CMS, hosting & web presence",
    "consumer_hardware": "Consumer, hardware & entertainment",
    "jobs_education_public": "Jobs, education & public information",
    "other": "Other / mixed",
}


# Hand-curated domain -> category overrides. classify() consults this map
# first, so these assignments always beat the keyword heuristics below.
# Values must be keys of CATEGORY_LABELS.
MANUAL_CATEGORY = {
    "cloudflare.com": "cloud_dev_infra",
    "azure.com": "cloud_dev_infra",
    "github.com": "cloud_dev_infra",
    "digicert.com": "security_identity",
    "wordpress.org": "cms_hosting_web",
    "adobe.com": "saas_productivity",
    "sentry.io": "cloud_dev_infra",
    "wordpress.com": "cms_hosting_web",
    "kaspersky.com": "security_identity",
    "dropbox.com": "saas_productivity",
    "gravatar.com": "cloud_dev_infra",
    "paypal.com": "commerce_payments",
    "shopify.com": "commerce_payments",
    "rubiconproject.com": "marketing_media_adtech",
    "taboola.com": "marketing_media_adtech",
    "avast.com": "security_identity",
    "weather.com": "jobs_education_public",
    "oxylabs.io": "cloud_dev_infra",
    "sourceforge.net": "cloud_dev_infra",
    "cisco.com": "cloud_dev_infra",
    "stripe.com": "commerce_payments",
    "linktr.ee": "marketing_media_adtech",
    "salesforce.com": "saas_productivity",
    "amplitude.com": "saas_productivity",
    "alidns.com": "cloud_dev_infra",
    "slack.com": "saas_productivity",
    "moe.video": "marketing_media_adtech",
    "dell.com": "consumer_hardware",
    "dailymotion.com": "marketing_media_adtech",
    "plesk.com": "cms_hosting_web",
    "nvidia.com": "consumer_hardware",
    "hcaptcha.com": "security_identity",
    "indeed.com": "jobs_education_public",
    "zendesk.com": "saas_productivity",
    "calendly.com": "saas_productivity",
    "adobe.net": "saas_productivity",
    "wp.com": "cms_hosting_web",
    "paloaltonetworks.com": "security_identity",
    "selectel.ru": "cloud_dev_infra",
    "name.com": "cms_hosting_web",
    "shein.com": "commerce_payments",
    "okta.com": "security_identity",
    "dynatrace.com": "cloud_dev_infra",
    "trendmicro.com": "security_identity",
    "yieldmo.com": "marketing_media_adtech",
    "target.com": "commerce_payments",
    "playstation.com": "consumer_hardware",
    "onetrust.com": "security_identity",
    "braze.com": "marketing_media_adtech",
    "klaviyo.com": "marketing_media_adtech",
    "intercom.io": "saas_productivity",
    "conviva.com": "marketing_media_adtech",
    "dreamhost.com": "cms_hosting_web",
    "optimizely.com": "marketing_media_adtech",
    "mailchimp.com": "marketing_media_adtech",
    "sophos.com": "security_identity",
    "bitrix24.ru": "saas_productivity",
    "cursor.sh": "cloud_dev_infra",
    "prnewswire.com": "marketing_media_adtech",
    "singular.net": "marketing_media_adtech",
    "moloco.com": "marketing_media_adtech",
    "datadoghq.com": "cloud_dev_infra",
    "wps.com": "saas_productivity",
    "qualtrics.com": "saas_productivity",
    "jimdo.com": "cms_hosting_web",
    "cloudinary.com": "cloud_dev_infra",
    "classlink.com": "jobs_education_public",
    "typeform.com": "saas_productivity",
    "agora.io": "cloud_dev_infra",
    "repubblica.it": "marketing_media_adtech",
    "wyzecam.com": "consumer_hardware",
    "kwai.com": "marketing_media_adtech",
    "onesignal.com": "marketing_media_adtech",
    "netgear.com": "consumer_hardware",
}


# (category, regex-of-alternatives) keyword rules used by classify() when a
# domain has no MANUAL_CATEGORY override. All matches are counted; list order
# is also the tie-break, since Counter.most_common preserves insertion order
# for equal counts — so earlier rules win ties.
KEYWORD_CATEGORY_RULES = [
    ("security_identity", r"security|privacy|identity|auth0|okta|captcha|certificate|ssl|trust|compliance|zero trust|endpoint"),
    ("cloud_dev_infra", r"developer|api|cloud|infrastructure|hosting|observability|monitoring|database|kubernetes|github|docs|cdn|server|network"),
    ("commerce_payments", r"payment|checkout|commerce|retail|ecommerce|shop|store|pricing|billing|invoice"),
    ("marketing_media_adtech", r"advertis|marketing|media|video|analytics|campaign|audience|newsletter|social|content"),
    ("saas_productivity", r"crm|customer|productivity|workspace|collaboration|support|meeting|form|survey|workflow|enterprise"),
    ("cms_hosting_web", r"wordpress|cms|website builder|domain|hosting|web presence|blog"),
    ("consumer_hardware", r"hardware|device|gaming|camera|electronics|consumer|storefront"),
    ("jobs_education_public", r"jobs|career|education|school|weather|news|public|government"),
]


def safe_name(domain: str) -> str:
    """Sanitise *domain* into a filesystem-safe file stem.

    Every run of characters outside [A-Za-z0-9_.-] collapses to one "_".
    """
    sanitized = re.sub(r"[^A-Za-z0-9_.-]+", "_", domain)
    return sanitized


def read_csv(path: Path) -> list[dict[str, str]]:
    """Load *path* as a list of row dicts; the header row supplies the keys."""
    with path.open(encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        return list(reader)


def raw_text(domain: str) -> str:
    """Return the cached llms.txt body for *domain*, or "" when no cache file.

    Undecodable bytes are replaced rather than raising.
    """
    source = RAW / f"{safe_name(domain)}.txt"
    if source.exists():
        return source.read_text(encoding="utf-8", errors="replace")
    return ""


def classify(row: dict[str, str], text: str) -> str:
    """Assign a category key (a CATEGORY_LABELS key) to one adopter row.

    A MANUAL_CATEGORY override wins outright. Otherwise the category whose
    keyword pattern matches most often across the domain, the first H1, and
    the first 20 kB of the raw llms.txt body wins; ties resolve in rule
    order. Falls back to "other" when nothing matches at all.
    """
    manual = MANUAL_CATEGORY.get(row["domain"])
    if manual is not None:
        return manual
    # Cap the text at 20 kB: plenty of signal, bounded regex cost.
    haystack = " ".join(
        [
            row["domain"],
            row.get("first_h1", ""),
            text[:20_000],
        ]
    ).lower()
    scores = Counter()
    for category, pattern in KEYWORD_CATEGORY_RULES:
        # haystack is already lower-cased; IGNORECASE is kept defensively in
        # case a rule pattern ever gains uppercase literals.
        scores[category] += len(re.findall(pattern, haystack, re.IGNORECASE))
    # The loop inserts every rule's key, so the Counter is never empty here
    # (the old `if not scores` branch was unreachable and has been removed).
    category, score = scores.most_common(1)[0]
    return category if score > 0 else "other"


def extract_structure(domain: str, limit: int = 8) -> dict[str, object]:
    """Preview the structure of a domain's raw llms.txt.

    Returns "first_headings" (H1-H3 lines) and "first_links"
    ("label -> url" pairs), each " | "-joined and capped at *limit* entries.
    """
    body = raw_text(domain)
    heading_matches = re.finditer(r"(?m)^#{1,3}\s+.+$", body)
    headings = [match.group(0).strip() for match in heading_matches]
    link_pairs = re.findall(r"\[([^\]]+)\]\(([^\)]+)\)", body)
    link_previews = [f"{label} -> {url}" for label, url in link_pairs[:limit]]
    return {
        "first_headings": " | ".join(headings[:limit]),
        "first_links": " | ".join(link_previews),
    }


def pct(part: int, whole: int) -> float:
    """Share of *part* in *whole* as a percentage, rounded to 2 decimal
    places; returns 0.0 rather than dividing by a zero *whole*."""
    if not whole:
        return 0.0
    return round(part / whole * 100, 2)


def save_csv(path: Path, rows: list[dict[str, object]], fields: list[str]) -> None:
    with path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for row in rows:
            writer.writerow({field: row.get(field, "") for field in fields})


def horizontal_bar(path: Path, rows: list[tuple[str, float]], title: str, xlabel: str, color: str) -> None:
    """Render *rows* of (label, value) pairs as a horizontal bar chart PNG.

    Bars are annotated with one decimal plus "%" when the x-axis label
    mentions "rate" or "share", otherwise with a thousands-separated integer.
    """
    labels = [label for label, _ in rows]
    values = [value for _, value in rows]
    plt.figure(figsize=(9, 5.2))
    positions = range(len(labels))
    plt.barh(positions, values, color=color)
    plt.yticks(positions, labels)
    plt.gca().invert_yaxis()  # keep the first row at the top
    plt.title(title)
    plt.xlabel(xlabel)
    as_percentage = "rate" in xlabel.lower() or "share" in xlabel.lower()
    xmax = max(values + [1])  # the [1] guards an empty/all-zero chart
    for index, value in enumerate(values):
        annotation = f"{value:.1f}%" if as_percentage else f"{int(value):,}"
        plt.text(value + xmax * 0.015, index, annotation, va="center", fontsize=9)
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def grouped_bar(path: Path, categories: list[str], a: list[float], b: list[float], title: str, ylabel: str, labels: tuple[str, str]) -> None:
    """Render a two-series grouped bar chart PNG comparing *a* vs *b*.

    One bar pair per entry of *categories*; *labels* names the two series.
    """
    width = 0.36
    positions = list(range(len(categories)))
    plt.figure(figsize=(9.5, 4.8))
    left = [center - width / 2 for center in positions]
    right = [center + width / 2 for center in positions]
    plt.bar(left, a, width=width, label=labels[0], color="#2563eb")
    plt.bar(right, b, width=width, label=labels[1], color="#0f766e")
    plt.xticks(positions, categories, rotation=25, ha="right")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def main() -> None:
    """Drive the full analysis: classify early adopters, aggregate quality
    stats per category / cohort / archetype, write CSV + JSON artifacts to
    DATA, and render summary charts to CHARTS."""
    # Cohort split by rank: "early" = top 1,000 domains, "later" = the rest.
    adopters = read_csv(DATA / "deep_llms_txt_adopters_top_10000.csv")
    early = [r for r in adopters if int(r["rank"]) <= 1000]
    later = [r for r in adopters if int(r["rank"]) > 1000]

    # Attach a category plus a heading/link preview to each early adopter.
    classified = []
    for row in early:
        text = raw_text(row["domain"])
        category = classify(row, text)
        structure = extract_structure(row["domain"])
        classified.append({**row, "early_category": category, "early_category_label": CATEGORY_LABELS[category], **structure})

    save_csv(
        DATA / "deep_early_adopters_top_1000_classified.csv",
        classified,
        [
            "rank",
            "domain",
            "early_category",
            "early_category_label",
            "llms_txt_bytes",
            "quality_score",
            "quality_tier",
            "archetype",
            "section_count",
            "markdown_link_count",
            "first_h1",
            "first_headings",
            "first_links",
            "llms_full_txt_valid",
            "llms_full_txt_bytes",
        ],
    )

    # Per-category aggregates over the early cohort, most common first.
    category_counts = Counter(r["early_category"] for r in classified)
    category_rows = []
    for category, count in category_counts.most_common():
        sub = [r for r in classified if r["early_category"] == category]
        category_rows.append(
            {
                "category": category,
                "category_label": CATEGORY_LABELS[category],
                "domains": count,
                "share_of_top1000_adopters_pct": pct(count, len(classified)),
                "median_quality_score": round(median(int(r["quality_score"]) for r in sub), 1),
                "median_bytes": round(median(int(r["llms_txt_bytes"]) for r in sub), 1),
                "median_markdown_links": round(median(int(r["markdown_link_count"]) for r in sub), 1),
                # Booleans sum as 0/1, so these are plain counts.
                "strong_structured_index": sum(r["quality_tier"] == "strong_structured_index" for r in sub),
                "structured_index": sum(r["archetype"] == "structured_index" for r in sub),
            }
        )
    save_csv(
        DATA / "deep_early_adopter_categories_top_1000.csv",
        category_rows,
        [
            "category",
            "category_label",
            "domains",
            "share_of_top1000_adopters_pct",
            "median_quality_score",
            "median_bytes",
            "median_markdown_links",
            "strong_structured_index",
            "structured_index",
        ],
    )

    def cohort_stats(name: str, rows: list[dict[str, str]]) -> dict[str, object]:
        """Summarise one cohort: medians of the numeric quality metrics plus
        counts and percentage shares of the notable tiers/archetypes."""
        return {
            "cohort": name,
            "domains": len(rows),
            "median_quality_score": round(median(int(r["quality_score"]) for r in rows), 1),
            "median_bytes": round(median(int(r["llms_txt_bytes"]) for r in rows), 1),
            "median_markdown_links": round(median(int(r["markdown_link_count"]) for r in rows), 1),
            "median_sections": round(median(int(r["section_count"]) for r in rows), 1),
            "strong_structured_index": sum(r["quality_tier"] == "strong_structured_index" for r in rows),
            "strong_structured_index_pct": pct(sum(r["quality_tier"] == "strong_structured_index" for r in rows), len(rows)),
            "structured_index": sum(r["archetype"] == "structured_index" for r in rows),
            "structured_index_pct": pct(sum(r["archetype"] == "structured_index" for r in rows), len(rows)),
            "symbolic_or_placeholder": sum(r["archetype"] == "symbolic_or_placeholder" for r in rows),
            "symbolic_or_placeholder_pct": pct(sum(r["archetype"] == "symbolic_or_placeholder" for r in rows), len(rows)),
            "massive_content_dump": sum(r["archetype"] == "massive_content_dump" for r in rows),
            "massive_content_dump_pct": pct(sum(r["archetype"] == "massive_content_dump" for r in rows), len(rows)),
        }

    # Three rows: the two disjoint cohorts plus the overall population.
    cohort_rows = [cohort_stats("Top 1,000 adopters", early), cohort_stats("Rank 1,001-10,000 adopters", later), cohort_stats("All adopters", adopters)]
    save_csv(
        DATA / "deep_quality_cohort_comparison.csv",
        cohort_rows,
        [
            "cohort",
            "domains",
            "median_quality_score",
            "median_bytes",
            "median_markdown_links",
            "median_sections",
            "strong_structured_index",
            "strong_structured_index_pct",
            "structured_index",
            "structured_index_pct",
            "symbolic_or_placeholder",
            "symbolic_or_placeholder_pct",
            "massive_content_dump",
            "massive_content_dump_pct",
        ],
    )

    # Per-archetype aggregates over ALL adopters, with best-ranked examples.
    archetype_rows = []
    for archetype, count in Counter(r["archetype"] for r in adopters).most_common():
        sub = [r for r in adopters if r["archetype"] == archetype]
        examples = sorted(sub, key=lambda r: int(r["rank"]))[:8]
        archetype_rows.append(
            {
                "archetype": archetype,
                "domains": count,
                "share_of_valid_pct": pct(count, len(adopters)),
                "median_quality_score": round(median(int(r["quality_score"]) for r in sub), 1),
                "median_bytes": round(median(int(r["llms_txt_bytes"]) for r in sub), 1),
                "median_markdown_links": round(median(int(r["markdown_link_count"]) for r in sub), 1),
                "median_sections": round(median(int(r["section_count"]) for r in sub), 1),
                "top_ranked_examples": ", ".join(f"{r['domain']} (#{r['rank']})" for r in examples),
            }
        )
    save_csv(
        DATA / "deep_quality_archetype_details.csv",
        archetype_rows,
        [
            "archetype",
            "domains",
            "share_of_valid_pct",
            "median_quality_score",
            "median_bytes",
            "median_markdown_links",
            "median_sections",
            "top_ranked_examples",
        ],
    )

    # A few hand-readable case studies from raw files.
    case_domains = [
        "cloudflare.com",
        "azure.com",
        "github.com",
        "stripe.com",
        "sentry.io",
        "salesforce.com",
        "sourceforge.net",
        "manageengine.com",
        "alidns.com",
        "wps.com",
    ]
    case_rows = []
    by_domain = {r["domain"]: r for r in adopters}
    for domain in case_domains:
        # Skip case-study domains absent from the adopter dataset.
        if domain not in by_domain:
            continue
        row = by_domain[domain]
        structure = extract_structure(domain, limit=10)
        case_rows.append(
            {
                "domain": domain,
                "rank": row["rank"],
                "bytes": row["llms_txt_bytes"],
                "quality_score": row["quality_score"],
                "archetype": row["archetype"],
                "sections": row["section_count"],
                "markdown_links": row["markdown_link_count"],
                "first_h1": row["first_h1"],
                **structure,
            }
        )
    save_csv(
        DATA / "deep_quality_case_studies.csv",
        case_rows,
        ["domain", "rank", "bytes", "quality_score", "archetype", "sections", "markdown_links", "first_h1", "first_headings", "first_links"],
    )

    # Charts: category mix, cohort density comparison, archetype link medians.
    horizontal_bar(
        CHARTS / "09_early_adopter_categories.png",
        [(CATEGORY_LABELS[r["category"]], float(r["domains"])) for r in category_rows],
        "Top 1,000 llms.txt adopters by category",
        "Domains",
        "#2563eb",
    )
    grouped_bar(
        CHARTS / "10_quality_cohort_comparison.png",
        ["Median score", "Median links", "Median sections"],
        [
            float(cohort_rows[0]["median_quality_score"]),
            float(cohort_rows[0]["median_markdown_links"]),
            float(cohort_rows[0]["median_sections"]),
        ],
        [
            float(cohort_rows[1]["median_quality_score"]),
            float(cohort_rows[1]["median_markdown_links"]),
            float(cohort_rows[1]["median_sections"]),
        ],
        "Top 1,000 adopters have denser llms.txt implementations",
        "Metric value",
        ("Top 1,000", "Rank 1,001-10,000"),
    )
    horizontal_bar(
        CHARTS / "11_quality_archetype_median_links.png",
        [(r["archetype"].replace("_", " "), float(r["median_markdown_links"])) for r in archetype_rows],
        "Median Markdown links by implementation archetype",
        "Median Markdown links",
        "#7c3aed",
    )

    # Machine-readable summary bundling every table plus headline findings;
    # written to disk and echoed to stdout.
    summary = {
        "top1000_adopters": len(early),
        "category_rows": category_rows,
        "cohort_rows": cohort_rows,
        "archetype_rows": archetype_rows,
        "case_studies": case_rows,
        "key_findings": {
            "top1000_structured_index_pct": cohort_rows[0]["structured_index_pct"],
            "later_structured_index_pct": cohort_rows[1]["structured_index_pct"],
            "top1000_strong_structured_index_pct": cohort_rows[0]["strong_structured_index_pct"],
            "later_strong_structured_index_pct": cohort_rows[1]["strong_structured_index_pct"],
            "top1000_median_links": cohort_rows[0]["median_markdown_links"],
            "later_median_links": cohort_rows[1]["median_markdown_links"],
            "top_category": category_rows[0],
        },
    }
    (DATA / "deep_early_quality_analysis.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))


# Standard entry guard: importing this module must not trigger the analysis.
if __name__ == "__main__":
    main()
