#!/usr/bin/env python3
"""
Drug Safety Intelligence Report
Datasets used: fda.faers  fda.orangebook  clinicaltrials.studies  clinvar.variants  pubmed.baseline

Cross-dataset value:
  FAERS tells you what adverse events occur in the real world post-approval.
  Orange Book tells you what the FDA approved and which generic alternatives exist.
  ClinicalTrials tells you what is being studied now.
  ClinVar tells you which patients carry genetic variants linked to the condition.
  PubMed tells you what the recent literature says.

  Together they answer: "Is this drug safe for my patient population, what genetic
  risk factors exist, and what is the current research trajectory?"

Usage:
  export MICROQUERY_TOKEN=your_token
  python3 drug_safety_report.py --drug metformin --condition diabetes
  python3 drug_safety_report.py --drug warfarin --condition "atrial fibrillation"
"""

import argparse
import os
import sys
from concurrent.futures import ThreadPoolExecutor
from client import MicroqueryClient, QueryError


def header(title: str):
    print(f"\n{'='*60}")
    print(f"  {title}")
    print(f"{'='*60}")


def section(title: str):
    print(f"\n--- {title} ---")


def run(drug: str, condition: str, mq: MicroqueryClient):
    drug_lower = drug.lower()
    cond_lower = condition.lower()

    header(f"DRUG SAFETY REPORT: {drug.upper()}")

    with ThreadPoolExecutor(max_workers=5) as ex:

        # ------------------------------------------------------------------ #
        # Fire all 5 independent queries in parallel.                         #
        # PubMed (59.8 GB) and ClinicalTrials are the longest; firing them    #
        # in the background while the smaller FAERS/OrangeBook run first.     #
        # ------------------------------------------------------------------ #
        f_pubmed = ex.submit(mq.query, "pubmed", f"""
            SELECT pub_year, COUNT(*) AS publications
            FROM   baseline
            WHERE  MedlineCitation.Article.ArticleTitle ~ '(?i){drug_lower}'
              AND  pub_year >= 2020
            GROUP BY pub_year
            ORDER BY pub_year DESC
            LIMIT 10
        """, verbose=True)

        f_trials = ex.submit(mq.query, "clinicaltrials", f"""
            SELECT nct_id, brief_title, overall_status, phase, enrollment, start_date
            FROM   studies
            WHERE  (brief_title ~ '(?i){drug_lower}' OR conditions ~ '(?i){cond_lower}')
              AND  overall_status IN ('RECRUITING', 'ACTIVE_NOT_RECRUITING', 'ENROLLING_BY_INVITATION')
            ORDER BY start_date DESC
            LIMIT 10
        """, verbose=True)

        f_clinvar = ex.submit(mq.query, "clinvar", f"""
            SELECT gene_symbol,
                   COUNT(*) AS pathogenic_variants,
                   SUM(CASE WHEN n_submitters >= 3 THEN 1 ELSE 0 END) AS well_reviewed
            FROM   variants, UNNEST(phenotypes) AS ph
            WHERE  assembly   = 'GRCh38'
              AND  sig_simple = 1
              AND  ph ~ '(?i){cond_lower}'
            GROUP BY gene_symbol
            ORDER BY well_reviewed DESC, pathogenic_variants DESC
            LIMIT 15
        """, verbose=True)

        # FAERS and OrangeBook are fast (~6 GB combined); run sequentially
        # in the foreground while the large queries are in flight.
        # reactions is an array; group by primary reaction (reactions[0])
        # outcomes codes: DE=death, LT=life-threatening, HO=hospitalization, DS=disability
        faers_rows = mq.query("fda", f"""
            SELECT reactions[0] AS reaction,
                   COUNT(*) AS reports,
                   SUM(CASE WHEN outcomes[0]='DE' OR outcomes[1]='DE' OR outcomes[2]='DE' THEN 1 ELSE 0 END) AS deaths,
                   SUM(CASE WHEN outcomes[0]='HO' OR outcomes[1]='HO' OR outcomes[2]='HO' THEN 1 ELSE 0 END) AS hospitalizations
            FROM   faers
            WHERE  drugname ~ '(?i){drug_lower}'
            GROUP BY reactions[0]
            ORDER BY reports DESC
            LIMIT 15
        """, verbose=True)

        # appl_type: 'N'=NDA (brand), 'A'=ANDA (generic)
        # active_ingredients is an array of {name, strength} objects
        ob_rows = mq.query("fda", f"""
            SELECT application_number, appl_type, brand_name,
                   active_ingredients[0].name AS ingredient,
                   active_ingredients[0].strength AS strength,
                   dosage_form, route, te_code, marketing_status
            FROM   orangebook
            WHERE  active_ingredients[0].name ~ '(?i){drug_lower}'
               OR  brand_name ~ '(?i){drug_lower}'
            ORDER BY appl_type, application_number
            LIMIT 20
        """, verbose=True)

        # Collect background results (likely already done by now)
        trial_rows  = f_trials.result()
        clinvar_rows = f_clinvar.result()
        pubmed_rows = f_pubmed.result()

    # ------------------------------------------------------------------ #
    # Print results in logical order                                       #
    # ------------------------------------------------------------------ #

    # ------------------------------------------------------------------ #
    # 1. FAERS — real-world adverse event profile                         #
    # ------------------------------------------------------------------ #
    section("1. Real-World Adverse Events (FDA FAERS)")
    if faers_rows:
        total_reports = sum(r["reports"] for r in faers_rows)
        print(f"  Top adverse reactions across {total_reports:,} reports (by primary event):")
        for r in faers_rows[:10]:
            hosp_pct = r["hospitalizations"] / r["reports"] * 100 if r["reports"] else 0
            print(
                f"  {str(r['reaction']):<40} {r['reports']:>6} reports  "
                f"{hosp_pct:>5.1f}% hosp"
                + (f"  ⚠ {r['deaths']} deaths" if r["deaths"] else "")
            )
    else:
        print(f"  No FAERS reports found for '{drug}'")

    # ------------------------------------------------------------------ #
    # 2. Orange Book — approval status and generic availability           #
    # ------------------------------------------------------------------ #
    section("2. FDA Approval Status (Orange Book)")
    if ob_rows:
        nda = [r for r in ob_rows if r.get("appl_type") == "N"]
        anda = [r for r in ob_rows if r.get("appl_type") == "A"]
        print(f"  Brand applications (NDA): {len(nda)}")
        print(f"  Generic applications (ANDA): {len(anda)}")
        for r in (nda or ob_rows)[:3]:
            appl_label = "NDA" if r.get("appl_type") == "N" else "ANDA"
            print(
                f"  {appl_label} {r['application_number']}  "
                f"{r.get('ingredient','')}  {r.get('dosage_form','')}  {r.get('route','')}  "
                f"{r.get('strength','')}  {r.get('marketing_status','')}  TE:{r.get('te_code','')}"
            )
    else:
        print(f"  No Orange Book entries found for '{drug}'")

    # ------------------------------------------------------------------ #
    # 3. ClinicalTrials — active research                                 #
    # ------------------------------------------------------------------ #
    section("3. Active Clinical Trials")
    if trial_rows:
        print(f"  {len(trial_rows)} actively recruiting or ongoing trials:")
        for r in trial_rows:
            phase = r.get("phase") or "N/A"
            n = r.get("enrollment") or "?"
            print(f"  {r['nct_id']}  Phase {phase:<3}  n={n:<6}  {r['brief_title'][:60]}")
    else:
        print(f"  No active trials found for '{drug}' / '{condition}'")

    # ------------------------------------------------------------------ #
    # 4. ClinVar — genetic variants linked to the condition               #
    # ------------------------------------------------------------------ #
    section(f"4. Genetic Risk Variants (ClinVar — condition: {condition})")
    if clinvar_rows:
        print(f"  Genes with pathogenic variants linked to '{condition}':")
        for r in clinvar_rows[:10]:
            print(
                f"  {r['gene_symbol']:<12}  {r['pathogenic_variants']:>4} pathogenic "
                f"({r['well_reviewed']} well-reviewed)"
            )
    else:
        print(f"  No ClinVar variants found for '{condition}'")

    # ------------------------------------------------------------------ #
    # 5. PubMed — recent literature pulse                                 #
    # ------------------------------------------------------------------ #
    section("5. Recent Literature (PubMed)")
    if pubmed_rows:
        total_pubs = sum(r["publications"] for r in pubmed_rows)
        print(f"  {total_pubs:,} PubMed publications mentioning '{drug}' since 2020:")
        for r in pubmed_rows:
            bar = "█" * min(r["publications"] // 50, 40)
            print(f"  {r['pub_year']}  {r['publications']:>5}  {bar}")
    else:
        print(f"  No PubMed entries found for '{drug}'")

    # ------------------------------------------------------------------ #
    # Summary                                                              #
    # ------------------------------------------------------------------ #
    header("SUMMARY")
    print(mq.cost_summary())


def main():
    parser = argparse.ArgumentParser(description="Drug safety intelligence report")
    parser.add_argument("--drug", required=True, help="Drug name (e.g. metformin)")
    parser.add_argument("--condition", required=True, help="Condition (e.g. diabetes)")
    parser.add_argument("--token", default=os.environ.get("MICROQUERY_TOKEN"),
                        help="Microquery API token (or set MICROQUERY_TOKEN)")
    args = parser.parse_args()

    if not args.token:
        print("Error: set MICROQUERY_TOKEN or pass --token", file=sys.stderr)
        sys.exit(1)

    mq = MicroqueryClient(api_key=args.token)
    try:
        run(args.drug, args.condition, mq)
    except QueryError as e:
        print(f"Query error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()