Skip to content

Commit 404afd0

Browse files
authored
Add files via upload
1 parent 0b00341 commit 404afd0

1 file changed

Lines changed: 355 additions & 0 deletions

File tree

Lines changed: 355 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,355 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"source": [
20+
"import re, json, time, random\n",
21+
"from dataclasses import dataclass\n",
22+
"from typing import Callable, Dict, Any, List, Tuple\n",
23+
"\n",
@dataclass
class ToolSpec:
    """Declarative description of a tool.

    Attributes:
        name: Identifier the tool is registered under.
        description: One-line human-readable summary.
        inputs: Maps each input parameter name to a type label such as
            "str" or "int" (labels are plain strings, not Python types).
        outputs: Maps each output field name to a type label.
    """

    name: str
    description: str
    inputs: Dict[str, str]
    outputs: Dict[str, str]


# One argument token: a dashed flag (-x / --long-name) or a bare word,
# optionally followed by ':' or '=' and a single-word type label.
_ARG_TOKEN_RE = re.compile(r"(--?\w[\w-]*|\b\w+\b)\s*[:=]?\s*(\w+)?")

# Keywords that introduce the output section rather than naming an input.
_NON_INPUT_KEYS = frozenset({"Returns", "Output", "Outputs"})


def parse_doc_to_spec(name: str, doc: str) -> ToolSpec:
    """Build a ToolSpec from a free-text usage string.

    The first line of the stripped text becomes the description. Every line
    containing "--" or ":" is then scanned for "name[: type]" tokens; the
    first occurrence of each name wins and a missing type defaults to "str".

    Args:
        name: Tool name; also used as the description when ``doc`` is empty.
        doc: Usage text. ``None`` is treated like an empty string so that a
            function's missing ``__doc__`` can be passed straight through.

    Returns:
        A ToolSpec whose ``outputs`` is always ``{"out": "json"}`` and whose
        ``inputs`` falls back to ``{"in": "str"}`` when nothing was parsed.
    """
    text = doc or ""
    stripped = text.strip()
    desc = stripped.splitlines()[0].strip() if stripped else name

    inputs: Dict[str, str] = {}
    for line in text.splitlines():
        # Only lines that look like they carry flags or "key: type" pairs.
        if "--" not in line and ":" not in line:
            continue
        for key, typ in _ARG_TOKEN_RE.findall(line):
            k = key.lstrip("-")
            if k and k not in inputs and k not in _NON_INPUT_KEYS:
                inputs[k] = typ or "str"

    if not inputs:
        inputs = {"in": "str"}
    return ToolSpec(name=name, description=desc, inputs=inputs, outputs={"out": "json"})
43+
],
44+
"metadata": {
45+
"id": "Xk2RZH5ZSjHn"
46+
},
47+
"execution_count": null,
48+
"outputs": []
49+
},
50+
{
51+
"cell_type": "code",
52+
"source": [
def tool_fastqc(seq_fasta: str, min_len:int=30) -> Dict[str,Any]:
    """Summarise the records of a FASTA string.

    Args:
        seq_fasta: FASTA text; each record starts with a ">" header line.
        min_len: Length threshold used for the "pct_q30" figure.

    Returns:
        A dict with "n_seqs" (record count), "len_mean" (mean record length
        with whitespace removed), "pct_q30" (fraction of records whose length
        is at least ``min_len``; no quality scores are involved) and "gc"
        (G/C characters, either case, divided by the total length). All
        ratios are 0.0 when there are no records.
    """
    # Splitting on header lines leaves whatever precedes the first header at
    # index 0; that is not a record, so it is dropped.
    records = re.split(r">[^\n]*\n", seq_fasta)[1:]
    lengths = [len(re.sub(r"\s+", "", rec)) for rec in records]
    n_records = len(lengths)
    total_bases = sum(lengths)

    passing = 0
    for length in lengths:
        if length >= min_len:
            passing += 1

    gc_count = 0
    for rec in records:
        for base in rec:
            if base in "GCgc":
                gc_count += 1

    return {
        "n_seqs": n_records,
        "len_mean": total_bases / max(1, n_records),
        "pct_q30": passing / max(1, n_records),
        "gc": gc_count / max(1, total_bases),
    }
59+
"\n",
def tool_bowtie2_like(ref:str, reads:str, mode:str="end-to-end") -> Dict[str,Any]:
    """Locate each read in the reference by exact substring match.

    A read is searched on the forward strand first and, if absent, as its
    reverse complement. Empty records are reported as unaligned.

    Args:
        ref: Reference as FASTA text; the first line is treated as the header
            and all following lines are concatenated into one sequence.
        reads: Reads as FASTA text, one record per ">" header.
        mode: Echoed back in the result; it does not affect matching.

    Returns:
        A dict with "n" (number of reads), "aligned" (number matched),
        "mode", and "hits": one dict per read with "read_id" (0-based order
        in ``reads``), "aligned" and "pos". "pos" is the 0-based offset in
        the reference of the matched sequence (the read itself, or its
        reverse complement when only that matches), or -1 when unaligned.
    """
    complement = str.maketrans("ACGTacgt", "TGCAtgca")

    def revcomp(s):
        return s.translate(complement)[::-1]

    reads_list = re.split(r">[^\n]*\n", reads)[1:]
    ref_seq = "".join(ref.splitlines()[1:])

    hits = []
    for i, r in enumerate(reads_list):
        rseq = "".join(r.split())
        pos = -1
        if rseq:  # "" is a substring of everything; never call that a hit
            pos = ref_seq.find(rseq)
            if pos < 0:
                # Keep "pos" consistent with "aligned" for reverse-strand hits.
                pos = ref_seq.find(revcomp(rseq))
        hits.append({"read_id": i, "aligned": pos >= 0, "pos": pos})

    return {
        "n": len(hits),
        "aligned": sum(h["aligned"] for h in hits),
        "mode": mode,
        "hits": hits,
    }
71+
"\n",
def tool_bcftools_like(ref:str, alt:str, win:int=15) -> Dict[str,Any]:
    """Report position-by-position substitutions between two sequences.

    Both inputs are FASTA text whose first line is treated as the header; the
    remaining lines are concatenated. Only the overlapping prefix is compared,
    so a length difference is not itself reported as a variant.

    Args:
        ref: Reference FASTA text.
        alt: Alternate FASTA text compared against ``ref``.
        win: Maximum number of variants included in "variants" (applied as a
            slice bound; "n_var" still counts every difference).

    Returns:
        A dict with "n_sites" (number of compared positions), "n_var" (number
        of differing positions) and "variants": dicts with "pos" (0-based),
        "ref" and "alt" for the first ``win`` differences.
    """
    ref_seq = "".join(ref.splitlines()[1:])
    alt_seq = "".join(alt.splitlines()[1:])

    # zip stops at the shorter sequence, matching n_sites below.
    variants = [
        {"pos": i, "ref": r, "alt": a}
        for i, (r, a) in enumerate(zip(ref_seq, alt_seq))
        if r != a
    ]
    return {
        "n_sites": min(len(ref_seq), len(alt_seq)),
        "n_var": len(variants),
        "variants": variants[:win],
    }
78+
"\n",
# Usage strings for the three demo tools. Line 1 is the human-readable
# description; line 2 lists the "--name: type" inputs and the output type.
FASTQC_DOC = (
    "FastQC-like quality control for FASTA\n"
    "--seq_fasta: str --min_len: int Outputs: json"
)
BOWTIE_DOC = (
    "Bowtie2-like aligner\n"
    "--ref: str --reads: str --mode: str Outputs: json"
)
BCF_DOC = (
    "bcftools-like variant caller\n"
    "--ref: str --alt: str --win: int Outputs: json"
)
85+
],
86+
"metadata": {
87+
"id": "iH7D9NKDSkHx"
88+
},
89+
"execution_count": null,
90+
"outputs": []
91+
},
92+
{
93+
"cell_type": "code",
94+
"source": [
# Pairs a tool's advertised spec with the Python callable that implements it.
@dataclass
class MCPTool:
    # Metadata describing the tool (name, description, inputs, outputs).
    spec: ToolSpec
    # Implementation; expected to return a dict.
    fn: Callable[..., Dict[str,Any]]
99+
"\n",
class MCPServer:
    """In-memory tool registry that advertises tools and dispatches calls."""

    def __init__(self):
        # Registered tools keyed by name; registering a name again replaces it.
        self.tools: Dict[str, MCPTool] = {}

    def register(self, name:str, doc:str, fn:Callable[...,Dict[str,Any]]):
        """Register ``fn`` under ``name``, deriving its spec from ``doc``."""
        self.tools[name] = MCPTool(parse_doc_to_spec(name, doc), fn)

    def list_tools(self) -> List[Dict[str,Any]]:
        """Return one plain dict (name, description, inputs, outputs) per tool."""
        return [
            {
                "name": tool.spec.name,
                "description": tool.spec.description,
                "inputs": tool.spec.inputs,
                "outputs": tool.spec.outputs,
            }
            for tool in self.tools.values()
        ]

    def call_tool(self, name:str, args:Dict[str,Any]) -> Dict[str,Any]:
        """Invoke a registered tool with the declared subset of ``args``.

        Only keys listed in the tool's spec are forwarded; other keys in
        ``args`` are ignored. Declared inputs that the caller did not supply
        are omitted, so the tool's own parameter defaults apply.

        Raises:
            KeyError: If no tool is registered under ``name``.
        """
        if name not in self.tools:
            raise KeyError(f"tool {name} not found")
        tool = self.tools[name]
        # Forwarding args.get(k) would pass None for every missing input and
        # silently override the callee's defaults.
        kwargs = {k: args[k] for k in tool.spec.inputs if k in args}
        return tool.fn(**kwargs)
111+
"\n",
# Module-level registry holding the three demo tools.
server = MCPServer()
server.register(name="fastqc", doc=FASTQC_DOC, fn=tool_fastqc)
server.register(name="bowtie2", doc=BOWTIE_DOC, fn=tool_bowtie2_like)
server.register(name="bcftools", doc=BCF_DOC, fn=tool_bcftools_like)
116+
"\n",
# One pipeline step: (tool name, argument template for that tool).
Task = Tuple[str, Dict[str,Any]]

# Named pipelines as ordered step lists. String values such as "{reads}" are
# placeholders; the other values are literal arguments.
PIPELINES = {
    "rnaseq_qc_align_call": [
        ("fastqc", dict(seq_fasta="{reads}", min_len=30)),
        ("bowtie2", dict(ref="{ref}", reads="{reads}", mode="end-to-end")),
        ("bcftools", dict(ref="{ref}", alt="{alt}", win=15)),
    ],
}
125+
"\n",
# Request patterns tried in order; the first one that matches selects the
# pipeline. Add a (pattern, key) pair here when a new pipeline is defined.
_PIPELINE_PATTERNS = (
    (re.compile(r"rna|qc|align|variant|call", re.I), "rnaseq_qc_align_call"),
)

# Pipeline used when no pattern matches the request.
_DEFAULT_PIPELINE = "rnaseq_qc_align_call"


def compile_pipeline(nl_request:str) -> List[Task]:
    """Map a natural-language request to a pipeline plan.

    Args:
        nl_request: Free-text description of the desired analysis.

    Returns:
        The step list stored in PIPELINES for the first matching pattern, or
        for the default pipeline when nothing matches. With the single
        pipeline currently defined, every request resolves to it. The list is
        the shared object from PIPELINES, not a copy.
    """
    for pattern, key in _PIPELINE_PATTERNS:
        if pattern.search(nl_request):
            return PIPELINES[key]
    return PIPELINES[_DEFAULT_PIPELINE]
129+
],
130+
"metadata": {
131+
"id": "CGM9J07zSm-p"
132+
},
133+
"execution_count": null,
134+
"outputs": []
135+
},
136+
{
137+
"cell_type": "code",
138+
"source": [
def mk_fasta(header:str, seq:str)->str:
    """Render one FASTA record: a ">" header line, then one sequence line."""
    return f">{header}\n{seq}\n"


# Demo fixtures. The seed is fixed so the generated data is reproducible.
random.seed(0)
REF_SEQ = "".join(random.choice("ACGT") for _ in range(300))
REF = mk_fasta("ref", REF_SEQ)
# r1 and r3 are exact slices of the reference; r2 is a synthetic repeat.
READS = (
    mk_fasta("r1", REF_SEQ[50:130])
    + mk_fasta("r2", "ACGT" * 15)
    + mk_fasta("r3", REF_SEQ[180:240])
)
# Single substitution at position 150. The replacement base is chosen to
# differ from the reference base; writing a fixed "T" produces no variant at
# all whenever the reference already has "T" there.
ALT = mk_fasta(
    "alt",
    REF_SEQ[:150] + ("T" if REF_SEQ[150] != "T" else "A") + REF_SEQ[151:],
)
145+
"\n",
def run_pipeline(nl:str, ctx:Dict[str,str]) -> Dict[str,Any]:
    """Resolve a request to a pipeline and run its steps in order.

    Args:
        nl: Natural-language request, passed to compile_pipeline.
        ctx: Values substituted into the step argument templates. Every
            string argument is expanded with ``str.format(**ctx)``, so a
            placeholder with no matching key raises KeyError.

    Returns:
        A dict with "request" (``nl``), "elapsed_s" (time spent executing the
        steps, rounded to 4 decimals) and "results": one dict per step with
        "tool", the expanded "args" and the tool's "output".
    """
    plan = compile_pipeline(nl)
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.
    started = time.perf_counter()
    results = []
    for name, arg_tpl in plan:
        # Non-string arguments (e.g. integer thresholds) pass through as-is.
        args = {
            k: (v.format(**ctx) if isinstance(v, str) else v)
            for k, v in arg_tpl.items()
        }
        out = server.call_tool(name, args)
        results.append({"tool": name, "args": args, "output": out})
    elapsed = time.perf_counter() - started
    return {"request": nl, "elapsed_s": round(elapsed, 4), "results": results}
153+
],
154+
"metadata": {
155+
"id": "eb6JS_luSo19"
156+
},
157+
"execution_count": null,
158+
"outputs": []
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": 2,
163+
"metadata": {
164+
"colab": {
165+
"base_uri": "https://localhost:8080/"
166+
},
167+
"id": "nb0-nNYkRHuC",
168+
"outputId": "e42167a9-40f4-45b2-a5d9-9ed195bd6d84"
169+
},
170+
"outputs": [
171+
{
172+
"output_type": "stream",
173+
"name": "stdout",
174+
"text": [
175+
"== TOOLS ==\n",
176+
"[\n",
177+
" {\n",
178+
" \"name\": \"fastqc\",\n",
179+
" \"description\": \"FastQC-like quality control for FASTA\",\n",
180+
" \"inputs\": {\n",
181+
" \"seq_fasta\": \"str\",\n",
182+
" \"min_len\": \"int\"\n",
183+
" },\n",
184+
" \"outputs\": {\n",
185+
" \"out\": \"json\"\n",
186+
" }\n",
187+
" },\n",
188+
" {\n",
189+
" \"name\": \"bowtie2\",\n",
190+
" \"description\": \"Bowtie2-like aligner\",\n",
191+
" \"inputs\": {\n",
192+
" \"ref\": \"str\",\n",
193+
" \"reads\": \"str\",\n",
194+
" \"mode\": \"str\"\n",
195+
" },\n",
196+
" \"outputs\": {\n",
197+
" \"out\": \"json\"\n",
198+
" }\n",
199+
" },\n",
200+
" {\n",
201+
" \"name\": \"bcftools\",\n",
202+
" \"description\": \"bcftools-like variant caller\",\n",
203+
" \"inputs\": {\n",
204+
" \"ref\": \"str\",\n",
205+
" \"alt\": \"str\",\n",
206+
" \"win\": \"int\"\n",
207+
" },\n",
208+
" \"outputs\": {\n",
209+
" \"out\": \"json\"\n",
210+
" }\n",
211+
" }\n",
212+
"]\n",
213+
"\n",
214+
"== INDIVIDUAL BENCH ==\n",
215+
"[\n",
216+
" {\n",
217+
" \"tool\": \"fastqc\",\n",
218+
" \"ok\": true,\n",
219+
" \"ms\": 0,\n",
220+
" \"out_keys\": [\n",
221+
" \"n_seqs\",\n",
222+
" \"len_mean\",\n",
223+
" \"pct_q30\",\n",
224+
" \"gc\"\n",
225+
" ],\n",
226+
" \"err\": null\n",
227+
" },\n",
228+
" {\n",
229+
" \"tool\": \"bowtie2\",\n",
230+
" \"ok\": true,\n",
231+
" \"ms\": 0,\n",
232+
" \"out_keys\": [\n",
233+
" \"n\",\n",
234+
" \"aligned\",\n",
235+
" \"mode\",\n",
236+
" \"hits\"\n",
237+
" ],\n",
238+
" \"err\": null\n",
239+
" },\n",
240+
" {\n",
241+
" \"tool\": \"bcftools\",\n",
242+
" \"ok\": true,\n",
243+
" \"ms\": 0,\n",
244+
" \"out_keys\": [\n",
245+
" \"n_sites\",\n",
246+
" \"n_var\",\n",
247+
" \"variants\"\n",
248+
" ],\n",
249+
" \"err\": null\n",
250+
" }\n",
251+
"]\n",
252+
"\n",
253+
"== PIPELINE BENCH ==\n",
254+
"{\n",
255+
" \"pipeline\": \"rnaseq_qc_align_call\",\n",
256+
" \"ok\": true,\n",
257+
" \"ms\": 0,\n",
258+
" \"n_steps\": 3\n",
259+
"}\n",
260+
"\n",
261+
"== PIPELINE RUN ==\n",
262+
"{\n",
263+
" \"request\": \"Run RNA-seq QC, align, and variant call.\",\n",
264+
" \"elapsed_s\": 0.0001,\n",
265+
" \"results\": [\n",
266+
" {\n",
267+
" \"tool\": \"fastqc\",\n",
268+
" \"args\": {\n",
269+
" \"seq_fasta\": \">r1\\nGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTC\\n>r2\\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\\n>r3\\nTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGG\\n\",\n",
270+
" \"min_len\": 30\n",
271+
" },\n",
272+
" \"output\": {\n",
273+
" \"n_seqs\": 3,\n",
274+
" \"len_mean\": 66.66666666666667,\n",
275+
" \"pct_q30\": 1.0,\n",
276+
" \"gc\": 0.46\n",
277+
" }\n",
278+
" },\n",
279+
" {\n",
280+
" \"tool\": \"bowtie2\",\n",
281+
" \"args\": {\n",
282+
" \"ref\": \">ref\\nTTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAA\\n\",\n",
283+
" \"reads\": \">r1\\nGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTC\\n>r2\\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\\n>r3\\nTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGG\\n\",\n",
284+
" \"mode\": \"end-to-end\"\n",
285+
" },\n",
286+
" \"output\": {\n",
287+
" \"n\": 3,\n",
288+
" \"aligned\": 2,\n",
289+
" \"mode\": \"end-to-end\",\n",
290+
" \"hits\": [\n",
291+
" {\n",
292+
" \"read_id\": 0,\n",
293+
" \"aligned\": true,\n",
294+
" \"pos\": 50\n",
295+
" },\n",
296+
" {\n",
297+
" \"read_id\": 1,\n",
298+
" \"aligned\": false,\n",
299+
" \"pos\": -1\n",
300+
" },\n",
301+
" {\n",
302+
" \"read_id\": 2,\n",
303+
" \"aligned\": true,\n",
304+
" \"pos\": 180\n",
305+
" }\n",
306+
" ]\n",
307+
" }\n",
308+
" },\n",
309+
" {\n",
310+
" \"tool\": \"bcftools\",\n",
311+
" \"args\": {\n",
312+
" \"ref\": \">ref\\nTTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAA\\n\",\n",
313+
" \"alt\": \">alt\\nTTAGTTGTGCCGCAGCGAAGTAGTGCTTGAAATATGCGACCCCTAAGTAGGAGCGTATGCGCCCAGTAACCAATGCCTGTTGAGATGCCAGACGCGTAACCAAAACATAGAAACCATCAATAGACAGGTCATAATCGGTCCACCGGATCATTGGTGCATAGAGCCTGGGCGTTAACGCCCTTTATTACTAGCTTAATGGTATCACATTGACAAACACGGCATTAAGTAGCGACGAAACGGGATTTGCCTGACCGGGGAGAAGCCGGTCGATCAGCAGTGGTAATTGGATATTAGGCCTAA\\n\",\n",
314+
" \"win\": 15\n",
315+
" },\n",
316+
" \"output\": {\n",
317+
" \"n_sites\": 300,\n",
318+
" \"n_var\": 0,\n",
319+
" \"variants\": []\n",
320+
" }\n",
321+
" }\n",
322+
" ]\n",
323+
"}\n"
324+
]
325+
}
326+
],
327+
"source": [
def bench_individual() -> List[Dict[str,Any]]:
    """Call each demo tool once and record whether it succeeded and how long it took.

    Returns:
        One dict per tool with "tool", "ok", "ms" (elapsed milliseconds,
        rounded to 3 decimals), "out_keys" (keys of the returned dict, empty
        on failure) and "err" (the exception message, or None on success).
    """
    cases = [
        ("fastqc", {"seq_fasta": READS, "min_len": 25}),
        ("bowtie2", {"ref": REF, "reads": READS, "mode": "end-to-end"}),
        ("bcftools", {"ref": REF, "alt": ALT, "win": 10}),
    ]
    rows = []
    for name, args in cases:
        ok = True
        err = None
        out = None
        t0 = time.perf_counter()
        try:
            out = server.call_tool(name, args)
        except Exception as e:
            # Benchmark boundary: record the failure and keep going so one
            # broken tool does not hide the results of the others.
            ok = False
            err = str(e)
        # Keep sub-millisecond resolution; truncating to an int reports 0
        # for every one of these fast in-memory calls.
        elapsed_ms = round((time.perf_counter() - t0) * 1000, 3)
        rows.append({
            "tool": name,
            "ok": ok,
            "ms": elapsed_ms,
            "out_keys": list(out) if ok and isinstance(out, dict) else [],
            "err": err,
        })
    return rows
341+
"\n",
def bench_pipeline() -> Dict[str,Any]:
    """Run the demo pipeline once end to end and summarise the run.

    Returns:
        A dict with "pipeline" (the pipeline name), "ok" (True when every
        step produced a non-empty output), "ms" (elapsed milliseconds,
        rounded to 3 decimals) and "n_steps".
    """
    t0 = time.perf_counter()
    res = run_pipeline(
        "Run RNA-seq QC, align, and variant call.",
        {"ref": REF, "reads": READS, "alt": ALT},
    )
    # Keep sub-millisecond resolution; an int here truncates to 0.
    elapsed_ms = round((time.perf_counter() - t0) * 1000, 3)
    steps = res["results"]
    ok = all(step["output"] for step in steps)
    return {
        "pipeline": "rnaseq_qc_align_call",
        "ok": ok,
        "ms": elapsed_ms,
        "n_steps": len(steps),
    }
347+
"\n",
# Report: registered tools, per-tool benchmark, pipeline benchmark, and the
# full output of one pipeline run, each pretty-printed as JSON.
print("== TOOLS ==")
print(json.dumps(server.list_tools(), indent=2))

print("\n== INDIVIDUAL BENCH ==")
print(json.dumps(bench_individual(), indent=2))

print("\n== PIPELINE BENCH ==")
print(json.dumps(bench_pipeline(), indent=2))

print("\n== PIPELINE RUN ==")
print(
    json.dumps(
        run_pipeline(
            "Run RNA-seq QC, align, and variant call.",
            {"ref": REF, "reads": READS, "alt": ALT},
        ),
        indent=2,
    )
)
352+
]
353+
}
354+
]
355+
}

0 commit comments

Comments
 (0)