#!/usr/bin/env python3
"""
Research-to-Clinic Pipeline

Datasets used:
    arxiv.papers
    pubmed.baseline
    clinicaltrials.studies
    clinvar.variants

Cross-dataset value:
    arXiv captures the research frontier — preprints appear months before
    journals, so this is where new ideas first surface. PubMed is the
    peer-reviewed record — publication volume tracks how established a
    research area is and whether it's growing or plateauing. ClinicalTrials
    shows whether basic research has crossed into clinical testing — the gap
    between PubMed papers and trial starts reveals translation lag. ClinVar
    links the condition to specific genetic variants, identifying which
    patient subpopulations are most relevant to the research.

    Together they answer: "How mature is the research on topic X, is it
    moving toward clinical application, and which genes/variants are
    central to it?"

Usage:
    export MICROQUERY_TOKEN=your_token
    python3 research_pipeline.py --topic "CRISPR"
    python3 research_pipeline.py --topic "GLP-1 obesity"
    python3 research_pipeline.py --topic "Alzheimer tau"
"""

import argparse
import os
import sys
from concurrent.futures import ThreadPoolExecutor

from client import MicroqueryClient, QueryError


def header(title: str):
    """Print a full-width banner for a major report section."""
    print(f"\n{'='*60}")
    print(f" {title}")
    print(f"{'='*60}")


def section(title: str):
    """Print a lightweight sub-section divider."""
    print(f"\n--- {title} ---")


def sparkline(values: list[float], width: int = 20) -> str:
    """Render a simple ASCII sparkline for a sequence of values.

    Returns a flat line of underscores (of length *width*) when there is
    no signal; otherwise one block character per value, scaled to the max.
    """
    if not values or max(values) == 0:
        return "_" * width
    hi = max(values)
    blocks = " ▁▂▃▄▅▆▇█"  # 9 levels: index 0..8, so v == hi maps to the last block
    return "".join(blocks[int(v / hi * 8)] for v in values)


def _year(val) -> int:
    """Best-effort conversion of a year-like value (int/float/str) to an int.

    The arXiv query derives ``yr`` as a float (1970 + epoch/31536000), so
    rows may carry e.g. 2024.3; PubMed rows carry plain ints. Returns 0
    when the value is missing or unparseable, which safely fails any
    ``>= YEAR`` comparison.
    """
    try:
        return int(float(val))
    except (TypeError, ValueError):
        return 0


def run(topic: str, mq: MicroqueryClient):
    """Run the four-dataset pipeline report for *topic* and print it.

    Phase 1 fires all four primary queries in parallel; phase 2 issues
    conditional follow-ups that depend on phase-1 results; the remainder
    prints each dataset's section and a cross-dataset synthesis.

    NOTE(review): *topic* is interpolated directly into query regexes —
    regex metacharacters (or quotes) in the topic will alter or break the
    queries. Acceptable for a trusted-operator CLI, but do not expose this
    to untrusted input without escaping.
    """
    header(f"RESEARCH PIPELINE: {topic.upper()}")

    with ThreadPoolExecutor(max_workers=6) as ex:
        # ------------------------------------------------------------------ #
        # Phase 1: fire all four main queries in parallel.
        # PubMed (84.7 GB) and arXiv are the heaviest; overlapping them with
        # the lighter ClinicalTrials and ClinVar cuts total wall time.
        # ------------------------------------------------------------------ #
        f_arxiv = ex.submit(mq.query, "arxiv", f"""
            SELECT 1970 + TO_UNIX_EPOCH(submitted) / 31536000 AS yr,
                   COUNT(*) AS papers,
                   COUNT(DISTINCT primary_cat) AS categories
            FROM papers
            WHERE title ~ '(?i){topic}' OR abstract ~ '(?i){topic}'
            GROUP BY yr ORDER BY yr DESC LIMIT 8
        """, verbose=True)

        f_pubmed = ex.submit(mq.query, "pubmed", f"""
            SELECT pub_year, COUNT(*) AS publications
            FROM baseline
            WHERE MedlineCitation.Article.ArticleTitle ~ '(?i){topic}'
              AND pub_year >= 2015
            GROUP BY pub_year ORDER BY pub_year DESC LIMIT 10
        """, verbose=True)

        f_trials = ex.submit(mq.query, "clinicaltrials", f"""
            SELECT overall_status, phase, COUNT(*) AS trials,
                   AVG(enrollment) AS avg_enrollment
            FROM studies
            WHERE conditions ~ '(?i){topic}' OR brief_title ~ '(?i){topic}'
            GROUP BY overall_status, phase ORDER BY trials DESC LIMIT 20
        """, verbose=True)

        f_genes = ex.submit(mq.query, "clinvar", f"""
            SELECT gene_symbol, COUNT(*) AS pathogenic_variants,
                   SUM(CASE WHEN review_status ~ 'multiple submitters'
                            THEN 1 ELSE 0 END) AS reviewed
            FROM variants, UNNEST(phenotypes) AS ph
            WHERE assembly = 'GRCh38' AND sig_simple = 1 AND ph ~ '(?i){topic}'
            GROUP BY gene_symbol ORDER BY pathogenic_variants DESC LIMIT 12
        """, verbose=True)

        # Collect phase-1 results (blocks until each query finishes).
        arxiv_trend = f_arxiv.result()
        pubmed_trend = f_pubmed.result()
        trial_rows = f_trials.result()
        gene_rows = f_genes.result()

        # ------------------------------------------------------------------ #
        # Phase 2: conditional follow-up queries (depend on phase-1 results)
        # ------------------------------------------------------------------ #
        f_cats = None
        f_recent = None
        if arxiv_trend:
            f_cats = ex.submit(mq.query, "arxiv", f"""
                SELECT primary_cat, COUNT(*) AS papers
                FROM papers
                WHERE (title ~ '(?i){topic}' OR abstract ~ '(?i){topic}')
                  AND submitted >= '2022-01-01'
                GROUP BY primary_cat ORDER BY papers DESC LIMIT 6
            """, verbose=True)
        if trial_rows:
            f_recent = ex.submit(mq.query, "clinicaltrials", f"""
                SELECT nct_id, brief_title, overall_status, phase,
                       enrollment, start_date
                FROM studies
                WHERE (conditions ~ '(?i){topic}' OR brief_title ~ '(?i){topic}')
                  AND overall_status = 'RECRUITING'
                ORDER BY start_date DESC LIMIT 5
            """, verbose=True)

        cat_rows = f_cats.result() if f_cats else []
        recent_trials = f_recent.result() if f_recent else []

    # ---------------------------------------------------------------------- #
    # 1. arXiv — frontier preprint activity
    # ---------------------------------------------------------------------- #
    section("1. arXiv Preprint Frontier")
    if arxiv_trend:
        total_arxiv = sum(r["papers"] for r in arxiv_trend)
        print(f" {total_arxiv:,} arXiv preprints mentioning '{topic}':")
        vals = [r["papers"] for r in reversed(arxiv_trend)]
        print(f" Trend (oldest→newest): {sparkline(vals)}")
        for r in arxiv_trend[:5]:
            yr = str(r["yr"])[:4] if r["yr"] else "?"
            cats = r.get("categories", 0)
            print(f" {yr} {r['papers']:>5} papers ({cats} categories)")
        if cat_rows:
            print(f" Top categories (2022+): "
                  + ", ".join(f"{r['primary_cat']}({r['papers']})" for r in cat_rows))
    else:
        print(f" No arXiv preprints found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 2. PubMed — peer-reviewed publication volume
    # ---------------------------------------------------------------------- #
    section("2. PubMed Peer-Reviewed Literature")
    if pubmed_trend:
        total_pubmed = sum(r["publications"] for r in pubmed_trend)
        print(f" {total_pubmed:,} PubMed publications since 2015:")
        vals = [r["publications"] for r in reversed(pubmed_trend)]
        print(f" Trend (2015→now): {sparkline(vals)}")
        for r in pubmed_trend[:5]:
            bar = "▓" * min(r["publications"] // 20, 30)
            print(f" {r['pub_year']} {r['publications']:>5} {bar}")
        if arxiv_trend:
            # Compare the most recent year PubMed actually returned (rows are
            # DESC-ordered), rather than a hardcoded calendar year that would
            # go stale every January.
            latest_yr = pubmed_trend[0]["pub_year"]
            recent_pubmed = pubmed_trend[0]["publications"]
            recent_arxiv = next(
                (r["papers"] for r in arxiv_trend if _year(r.get("yr")) == latest_yr), 0)
            if recent_pubmed > 0:
                ratio = recent_arxiv / recent_pubmed
                print(f"\n arXiv/PubMed ratio ({latest_yr}): {ratio:.1f}x "
                      f"— {'preprints heavily outpacing journals (fast-moving field)' if ratio > 2 else 'preprints and journals roughly in sync'}")
    else:
        print(f" No PubMed publications found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 3. ClinicalTrials — translation into clinical testing
    # ---------------------------------------------------------------------- #
    section("3. Clinical Translation (ClinicalTrials.gov)")
    if trial_rows:
        total_trials = sum(r["trials"] for r in trial_rows)
        print(f" {total_trials:,} clinical trials for '{topic}':")
        active = sum(r["trials"] for r in trial_rows
                     if r.get("overall_status") in
                     ("RECRUITING", "ACTIVE_NOT_RECRUITING", "ENROLLING_BY_INVITATION"))
        completed = sum(r["trials"] for r in trial_rows
                        if r.get("overall_status") == "COMPLETED")
        phase3 = sum(r["trials"] for r in trial_rows
                     if (r.get("phase") or "").startswith("PHASE3"))
        print(f" Active/recruiting: {active} | Completed: {completed} | Phase 3+: {phase3}")
        for r in trial_rows[:8]:
            avg_n = f"{r['avg_enrollment']:.0f}" if r.get("avg_enrollment") else "?"
            print(
                f" {r.get('overall_status','?'):<32} "
                f"Phase {r.get('phase','?'):<8} "
                f"{r['trials']:>4} trials avg n={avg_n}"
            )
        if recent_trials:
            print(f"\n Most recently opened recruiting trials:")
            for r in recent_trials:
                print(f" {r['nct_id']} Phase {r.get('phase','?'):<8} "
                      f"n={r.get('enrollment','?'):<6} {r['brief_title'][:50]}")
    else:
        print(f" No clinical trials found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 4. ClinVar — genetic architecture of the condition
    # ---------------------------------------------------------------------- #
    section("4. Genetic Variants (ClinVar)")
    if gene_rows:
        total_variants = sum(r["pathogenic_variants"] for r in gene_rows)
        print(f" {total_variants:,} pathogenic variants (GRCh38) linked to '{topic}':")
        for r in gene_rows[:10]:
            # Grouped COUNT(*) is always >= 1, so the division is safe; SUM is
            # guarded with `or 0` in case the engine returns NULL.
            reviewed_pct = (r["reviewed"] or 0) / r["pathogenic_variants"] * 100
            bar = "▓" * min(r["pathogenic_variants"] // 10, 25)
            print(
                f" {r['gene_symbol']:<12} {r['pathogenic_variants']:>4} variants "
                f"{reviewed_pct:>4.0f}% multi-reviewed {bar}"
            )
    else:
        print(f" No ClinVar pathogenic variants found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # Synthesis
    # ---------------------------------------------------------------------- #
    header("SYNTHESIS")
    if arxiv_trend and pubmed_trend and trial_rows and gene_rows:
        # The labels below say "(2023+)", so SUM across all qualifying years.
        # (Previously `next(...)` grabbed only the first DESC-ordered row —
        # i.e. a single year — while the output claimed a cumulative total.)
        recent_arxiv = sum(r["papers"] for r in arxiv_trend
                           if _year(r.get("yr")) >= 2023)
        recent_pubmed = sum(r["publications"] for r in pubmed_trend
                            if r["pub_year"] >= 2023)
        total_trials = sum(r["trials"] for r in trial_rows)
        top_gene = gene_rows[0]["gene_symbol"] if gene_rows else "—"
        maturity = ("early-stage" if total_trials < 20
                    else ("growing" if total_trials < 100 else "mature"))
        momentum = "accelerating" if (recent_arxiv or 0) > 100 else "steady"
        print(f" Research area: {maturity}, {momentum}")
        print(f" Publication: {recent_arxiv} arXiv (2023+) / {recent_pubmed} PubMed (2023+)")
        print(f" Clinical: {total_trials} total trials")
        print(f" Key gene: {top_gene} ({gene_rows[0]['pathogenic_variants']} pathogenic variants)")

    print(f"\n{mq.cost_summary()}")


def main():
    """Parse CLI arguments, build the client, and run the report."""
    parser = argparse.ArgumentParser(description="Research-to-clinic pipeline report")
    parser.add_argument("--topic", required=True,
                        help="Research topic / condition (e.g. 'CRISPR', 'GLP-1 obesity')")
    parser.add_argument("--token", default=os.environ.get("MICROQUERY_TOKEN"),
                        help="Microquery API token (or set MICROQUERY_TOKEN)")
    args = parser.parse_args()

    if not args.token:
        print("Error: set MICROQUERY_TOKEN or pass --token", file=sys.stderr)
        sys.exit(1)

    mq = MicroqueryClient(api_key=args.token)
    try:
        run(args.topic, mq)
    except QueryError as e:
        print(f"Query error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()