#!/usr/bin/env python3
"""
Research-to-Clinic Pipeline

Datasets used:
    arxiv.papers
    pubmed.baseline
    clinicaltrials.studies
    clinvar.variants

Cross-dataset value:
    arXiv captures the research frontier — preprints appear months before
    journals, so this is where new ideas first surface. PubMed is the
    peer-reviewed record — publication volume tracks how established a
    research area is and whether it's growing or plateauing. ClinicalTrials
    shows whether basic research has crossed into clinical testing — the gap
    between PubMed papers and trial starts reveals translation lag. ClinVar
    links the condition to specific genetic variants, identifying which
    patient subpopulations are most relevant to the research.

    Together they answer: "How mature is the research on topic X, is it
    moving toward clinical application, and which genes/variants are
    central to it?"

Usage:
    export MICROQUERY_TOKEN=your_token
    python3 research_pipeline.py --topic "CRISPR"
    python3 research_pipeline.py --topic "GLP-1 obesity"
    python3 research_pipeline.py --topic "Alzheimer tau"
"""

import argparse
import os
import sys
from concurrent.futures import ThreadPoolExecutor

from client import MicroqueryClient, QueryError


def header(title: str):
    """Print a full-width banner for a major report section."""
    print(f"\n{'='*60}")
    print(f" {title}")
    print(f"{'='*60}")


def section(title: str):
    """Print a lightweight sub-section divider."""
    print(f"\n--- {title} ---")


def sparkline(values: list[float], width: int = 20) -> str:
    """Render a simple ASCII sparkline for a sequence of values.

    Returns a flat line of underscores (of length *width*) when there is
    no signal; otherwise one block character per value, scaled to the max.
    """
    if not values or max(values) == 0:
        return "_" * width
    hi = max(values)
    blocks = " ▁▂▃▄▅▆▇█"  # 9 levels: index 0..8, so v == hi maps to the last block
    return "".join(blocks[int(v / hi * 8)] for v in values)


def _year(val) -> int:
    """Best-effort conversion of a year-like value (int/float/str) to an int.

    The arXiv query derives ``yr`` as a float (1970 + epoch/31536000), so
    rows may carry e.g. 2024.3; PubMed rows carry plain ints. Returns 0
    when the value is missing or unparseable, which safely fails any
    ``>= YEAR`` comparison.
    """
    try:
        return int(float(val))
    except (TypeError, ValueError):
        return 0


def run(topic: str, mq: MicroqueryClient):
    """Run the four-dataset pipeline report for *topic* and print it.

    Phase 1 fires all four primary queries in parallel; phase 2 issues
    conditional follow-ups that depend on phase-1 results; the remainder
    prints each dataset's section and a cross-dataset synthesis.

    NOTE(review): *topic* is interpolated directly into query regexes —
    regex metacharacters (or quotes) in the topic will alter or break the
    queries. Acceptable for a trusted-operator CLI, but do not expose this
    to untrusted input without escaping.
    """
    header(f"RESEARCH PIPELINE: {topic.upper()}")

    with ThreadPoolExecutor(max_workers=6) as ex:
        # ------------------------------------------------------------------ #
        # Phase 1: fire all four main queries in parallel.
        # PubMed (84.7 GB) and arXiv are the heaviest; overlapping them with
        # the lighter ClinicalTrials and ClinVar cuts total wall time.
        # ------------------------------------------------------------------ #
        f_arxiv = ex.submit(mq.query, "arxiv", f"""
            SELECT 1970 + TO_UNIX_EPOCH(submitted) / 31536000 AS yr,
                   COUNT(*) AS papers,
                   COUNT(DISTINCT primary_cat) AS categories
            FROM papers
            WHERE title ~ '(?i){topic}' OR abstract ~ '(?i){topic}'
            GROUP BY yr ORDER BY yr DESC LIMIT 8
        """, verbose=True)

        f_pubmed = ex.submit(mq.query, "pubmed", f"""
            SELECT pub_year, COUNT(*) AS publications
            FROM baseline
            WHERE MedlineCitation.Article.ArticleTitle ~ '(?i){topic}'
              AND pub_year >= 2015
            GROUP BY pub_year ORDER BY pub_year DESC LIMIT 10
        """, verbose=True)

        f_trials = ex.submit(mq.query, "clinicaltrials", f"""
            SELECT overall_status, phase, COUNT(*) AS trials,
                   AVG(enrollment) AS avg_enrollment
            FROM studies
            WHERE conditions ~ '(?i){topic}' OR brief_title ~ '(?i){topic}'
            GROUP BY overall_status, phase ORDER BY trials DESC LIMIT 20
        """, verbose=True)

        f_genes = ex.submit(mq.query, "clinvar", f"""
            SELECT gene_symbol, COUNT(*) AS pathogenic_variants,
                   SUM(CASE WHEN review_status ~ 'multiple submitters'
                            THEN 1 ELSE 0 END) AS reviewed
            FROM variants, UNNEST(phenotypes) AS ph
            WHERE assembly = 'GRCh38' AND sig_simple = 1 AND ph ~ '(?i){topic}'
            GROUP BY gene_symbol ORDER BY pathogenic_variants DESC LIMIT 12
        """, verbose=True)

        # Collect phase-1 results (blocks until each query finishes).
        arxiv_trend = f_arxiv.result()
        pubmed_trend = f_pubmed.result()
        trial_rows = f_trials.result()
        gene_rows = f_genes.result()

        # ------------------------------------------------------------------ #
        # Phase 2: conditional follow-up queries (depend on phase-1 results)
        # ------------------------------------------------------------------ #
        f_cats = None
        f_recent = None
        if arxiv_trend:
            f_cats = ex.submit(mq.query, "arxiv", f"""
                SELECT primary_cat, COUNT(*) AS papers
                FROM papers
                WHERE (title ~ '(?i){topic}' OR abstract ~ '(?i){topic}')
                  AND submitted >= '2022-01-01'
                GROUP BY primary_cat ORDER BY papers DESC LIMIT 6
            """, verbose=True)
        if trial_rows:
            f_recent = ex.submit(mq.query, "clinicaltrials", f"""
                SELECT nct_id, brief_title, overall_status, phase,
                       enrollment, start_date
                FROM studies
                WHERE (conditions ~ '(?i){topic}' OR brief_title ~ '(?i){topic}')
                  AND overall_status = 'RECRUITING'
                ORDER BY start_date DESC LIMIT 5
            """, verbose=True)

        cat_rows = f_cats.result() if f_cats else []
        recent_trials = f_recent.result() if f_recent else []

    # ---------------------------------------------------------------------- #
    # 1. arXiv — frontier preprint activity
    # ---------------------------------------------------------------------- #
    section("1. arXiv Preprint Frontier")
    if arxiv_trend:
        total_arxiv = sum(r["papers"] for r in arxiv_trend)
        print(f" {total_arxiv:,} arXiv preprints mentioning '{topic}':")
        vals = [r["papers"] for r in reversed(arxiv_trend)]
        print(f" Trend (oldest→newest): {sparkline(vals)}")
        for r in arxiv_trend[:5]:
            yr = str(r["yr"])[:4] if r["yr"] else "?"
            cats = r.get("categories", 0)
            print(f" {yr} {r['papers']:>5} papers ({cats} categories)")
        if cat_rows:
            print(f" Top categories (2022+): "
                  + ", ".join(f"{r['primary_cat']}({r['papers']})" for r in cat_rows))
    else:
        print(f" No arXiv preprints found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 2. PubMed — peer-reviewed publication volume
    # ---------------------------------------------------------------------- #
    section("2. PubMed Peer-Reviewed Literature")
    if pubmed_trend:
        total_pubmed = sum(r["publications"] for r in pubmed_trend)
        print(f" {total_pubmed:,} PubMed publications since 2015:")
        vals = [r["publications"] for r in reversed(pubmed_trend)]
        print(f" Trend (2015→now): {sparkline(vals)}")
        for r in pubmed_trend[:5]:
            bar = "▓" * min(r["publications"] // 20, 30)
            print(f" {r['pub_year']} {r['publications']:>5} {bar}")
        if arxiv_trend:
            # Compare the most recent year PubMed actually returned (rows are
            # DESC-ordered), rather than a hardcoded calendar year that would
            # go stale every January.
            latest_yr = pubmed_trend[0]["pub_year"]
            recent_pubmed = pubmed_trend[0]["publications"]
            recent_arxiv = next(
                (r["papers"] for r in arxiv_trend if _year(r.get("yr")) == latest_yr), 0)
            if recent_pubmed > 0:
                ratio = recent_arxiv / recent_pubmed
                print(f"\n arXiv/PubMed ratio ({latest_yr}): {ratio:.1f}x "
                      f"— {'preprints heavily outpacing journals (fast-moving field)' if ratio > 2 else 'preprints and journals roughly in sync'}")
    else:
        print(f" No PubMed publications found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 3. ClinicalTrials — translation into clinical testing
    # ---------------------------------------------------------------------- #
    section("3. Clinical Translation (ClinicalTrials.gov)")
    if trial_rows:
        total_trials = sum(r["trials"] for r in trial_rows)
        print(f" {total_trials:,} clinical trials for '{topic}':")
        active = sum(r["trials"] for r in trial_rows
                     if r.get("overall_status") in
                     ("RECRUITING", "ACTIVE_NOT_RECRUITING", "ENROLLING_BY_INVITATION"))
        completed = sum(r["trials"] for r in trial_rows
                        if r.get("overall_status") == "COMPLETED")
        phase3 = sum(r["trials"] for r in trial_rows
                     if (r.get("phase") or "").startswith("PHASE3"))
        print(f" Active/recruiting: {active} | Completed: {completed} | Phase 3+: {phase3}")
        for r in trial_rows[:8]:
            avg_n = f"{r['avg_enrollment']:.0f}" if r.get("avg_enrollment") else "?"
            print(
                f" {r.get('overall_status','?'):<32} "
                f"Phase {r.get('phase','?'):<8} "
                f"{r['trials']:>4} trials avg n={avg_n}"
            )
        if recent_trials:
            print(f"\n Most recently opened recruiting trials:")
            for r in recent_trials:
                print(f" {r['nct_id']} Phase {r.get('phase','?'):<8} "
                      f"n={r.get('enrollment','?'):<6} {r['brief_title'][:50]}")
    else:
        print(f" No clinical trials found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # 4. ClinVar — genetic architecture of the condition
    # ---------------------------------------------------------------------- #
    section("4. Genetic Variants (ClinVar)")
    if gene_rows:
        total_variants = sum(r["pathogenic_variants"] for r in gene_rows)
        print(f" {total_variants:,} pathogenic variants (GRCh38) linked to '{topic}':")
        for r in gene_rows[:10]:
            # Grouped COUNT(*) is always >= 1, so the division is safe; SUM is
            # guarded with `or 0` in case the engine returns NULL.
            reviewed_pct = (r["reviewed"] or 0) / r["pathogenic_variants"] * 100
            bar = "▓" * min(r["pathogenic_variants"] // 10, 25)
            print(
                f" {r['gene_symbol']:<12} {r['pathogenic_variants']:>4} variants "
                f"{reviewed_pct:>4.0f}% multi-reviewed {bar}"
            )
    else:
        print(f" No ClinVar pathogenic variants found for '{topic}'")

    # ---------------------------------------------------------------------- #
    # Synthesis
    # ---------------------------------------------------------------------- #
    header("SYNTHESIS")
    if arxiv_trend and pubmed_trend and trial_rows and gene_rows:
        # The labels below say "(2023+)", so SUM across all qualifying years.
        # (Previously `next(...)` grabbed only the first DESC-ordered row —
        # i.e. a single year — while the output claimed a cumulative total.)
        recent_arxiv = sum(r["papers"] for r in arxiv_trend
                           if _year(r.get("yr")) >= 2023)
        recent_pubmed = sum(r["publications"] for r in pubmed_trend
                            if r["pub_year"] >= 2023)
        total_trials = sum(r["trials"] for r in trial_rows)
        top_gene = gene_rows[0]["gene_symbol"] if gene_rows else "—"
        maturity = ("early-stage" if total_trials < 20
                    else ("growing" if total_trials < 100 else "mature"))
        momentum = "accelerating" if (recent_arxiv or 0) > 100 else "steady"
        print(f" Research area: {maturity}, {momentum}")
        print(f" Publication: {recent_arxiv} arXiv (2023+) / {recent_pubmed} PubMed (2023+)")
        print(f" Clinical: {total_trials} total trials")
        print(f" Key gene: {top_gene} ({gene_rows[0]['pathogenic_variants']} pathogenic variants)")

    print(f"\n{mq.cost_summary()}")


def main():
    """Parse CLI arguments, build the client, and run the report."""
    parser = argparse.ArgumentParser(description="Research-to-clinic pipeline report")
    parser.add_argument("--topic", required=True,
                        help="Research topic / condition (e.g. 'CRISPR', 'GLP-1 obesity')")
    parser.add_argument("--token", default=os.environ.get("MICROQUERY_TOKEN"),
                        help="Microquery API token (or set MICROQUERY_TOKEN)")
    args = parser.parse_args()

    if not args.token:
        print("Error: set MICROQUERY_TOKEN or pass --token", file=sys.stderr)
        sys.exit(1)

    mq = MicroqueryClient(api_key=args.token)
    try:
        run(args.topic, mq)
    except QueryError as e:
        print(f"Query error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()