1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " code" ,
19+ "source" : [
20+ " !pip install -q transformers accelerate sentencepiece nest_asyncio\n " ,
21+ " import torch, asyncio, uuid\n " ,
22+ " from transformers import pipeline\n " ,
23+ " import nest_asyncio\n " ,
24+ " nest_asyncio.apply()"
25+ ],
26+ "metadata" : {
27+ "id" : " A8kEftfYLR-p"
28+ },
29+ "execution_count" : null ,
30+ "outputs" : []
31+ },
32+ {
33+ "cell_type" : " code" ,
34+ "source" : [
35+ " class LocalLLM:\n " ,
36+ " def __init__(self, model_name=\" google/flan-t5-small\" , max_new_tokens=128):\n " ,
37+ " self.pipe = pipeline(\" text2text-generation\" , model=model_name, device=0 if torch.cuda.is_available() else -1)\n " ,
38+ " self.max_new_tokens = max_new_tokens\n " ,
39+ " def generate(self, prompt: str) -> str:\n " ,
40+ " out = self.pipe(prompt, max_new_tokens=self.max_new_tokens, temperature=0.0)[0][\" generated_text\" ]\n " ,
41+ " return out.strip()\n " ,
42+ " \n " ,
43+ " class VirtualComputer:\n " ,
44+ " def __init__(self):\n " ,
45+ " self.apps = {\" browser\" : \" https://example.com\" , \" notes\" : \"\" , \" mail\" : [\" Welcome to CUA\" , \" Invoice #221\" , \" Weekly Report\" ]}\n " ,
46+ " self.focus = \" browser\"\n " ,
47+ " self.screen = \" Browser open at https://example.com\\ nSearch bar focused.\"\n " ,
48+ " self.action_log = []\n " ,
49+ " def screenshot(self):\n " ,
50+ " return f\" FOCUS:{self.focus}\\ nSCREEN:\\ n{self.screen}\\ nAPPS:{list(self.apps.keys())}\"\n " ,
51+ " def click(self, target:str):\n " ,
52+ " if target in self.apps:\n " ,
53+ " self.focus = target\n " ,
54+ " if target==\" browser\" :\n " ,
55+ " self.screen = f\" Browser tab: {self.apps['browser']}\\ nAddress bar focused.\"\n " ,
56+ " elif target==\" notes\" :\n " ,
57+ " self.screen = f\" Notes App\\ nCurrent notes:\\ n{self.apps['notes']}\"\n " ,
58+ " elif target==\" mail\" :\n " ,
59+ " inbox = \"\\ n\" .join(f\" - {s}\" for s in self.apps['mail'])\n " ,
60+ " self.screen = f\" Mail App Inbox:\\ n{inbox}\\ n(Read-only preview)\"\n " ,
61+ " else:\n " ,
62+ " self.screen += f\"\\ nClicked '{target}'.\"\n " ,
63+ " self.action_log.append({\" type\" :\" click\" ,\" target\" :target})\n " ,
64+ " def type(self, text:str):\n " ,
65+ " if self.focus==\" browser\" :\n " ,
66+ " self.apps[\" browser\" ] = text\n " ,
67+ " self.screen = f\" Browser tab now at {text}\\ nPage headline: Example Domain\"\n " ,
68+ " elif self.focus==\" notes\" :\n " ,
69+ " self.apps[\" notes\" ] += (\"\\ n\" +text)\n " ,
70+ " self.screen = f\" Notes App\\ nCurrent notes:\\ n{self.apps['notes']}\"\n " ,
71+ " else:\n " ,
72+ " self.screen += f\"\\ nTyped '{text}' but no editable field.\"\n " ,
73+ " self.action_log.append({\" type\" :\" type\" ,\" text\" :text})"
74+ ],
75+ "metadata" : {
76+ "id" : " udvEjvRiLRyj"
77+ },
78+ "execution_count" : null ,
79+ "outputs" : []
80+ },
81+ {
82+ "cell_type" : " code" ,
83+ "source" : [
84+ " class ComputerTool:\n " ,
85+ " def __init__(self, computer:VirtualComputer):\n " ,
86+ " self.computer = computer\n " ,
87+ " def run(self, command:str, argument:str=\"\" ):\n " ,
88+ " if command==\" click\" :\n " ,
89+ " self.computer.click(argument)\n " ,
90+ " return {\" status\" :\" completed\" ,\" result\" :f\" clicked {argument}\" }\n " ,
91+ " if command==\" type\" :\n " ,
92+ " self.computer.type(argument)\n " ,
93+ " return {\" status\" :\" completed\" ,\" result\" :f\" typed {argument}\" }\n " ,
94+ " if command==\" screenshot\" :\n " ,
95+ " snap = self.computer.screenshot()\n " ,
96+ " return {\" status\" :\" completed\" ,\" result\" :snap}\n " ,
97+ " return {\" status\" :\" error\" ,\" result\" :f\" unknown command {command}\" }"
98+ ],
99+ "metadata" : {
100+ "id" : " 7jNxJAytLRuq"
101+ },
102+ "execution_count" : null ,
103+ "outputs" : []
104+ },
105+ {
106+ "cell_type" : " code" ,
107+ "source" : [
108+ " class ComputerAgent:\n " ,
109+ " def __init__(self, llm:LocalLLM, tool:ComputerTool, max_trajectory_budget:float=5.0):\n " ,
110+ " self.llm = llm\n " ,
111+ " self.tool = tool\n " ,
112+ " self.max_trajectory_budget = max_trajectory_budget\n " ,
113+ " async def run(self, messages):\n " ,
114+ " user_goal = messages[-1][\" content\" ]\n " ,
115+ " steps_remaining = int(self.max_trajectory_budget)\n " ,
116+ " output_events = []\n " ,
117+ " total_prompt_tokens = 0\n " ,
118+ " total_completion_tokens = 0\n " ,
119+ " while steps_remaining>0:\n " ,
120+ " screen = self.tool.computer.screenshot()\n " ,
121+ " prompt = (\n " ,
122+ " \" You are a computer-use agent.\\ n\"\n " ,
123+ " f\" User goal: {user_goal}\\ n\"\n " ,
124+ " f\" Current screen:\\ n{screen}\\ n\\ n\"\n " ,
125+ " \" Think step-by-step.\\ n\"\n " ,
126+ " \" Reply with: ACTION <click/type/screenshot> ARG <target or text> THEN <assistant message>.\\ n\"\n " ,
127+ " )\n " ,
128+ " thought = self.llm.generate(prompt)\n " ,
129+ " total_prompt_tokens += len(prompt.split())\n " ,
130+ " total_completion_tokens += len(thought.split())\n " ,
131+ " action=\" screenshot\" ; arg=\"\" ; assistant_msg=\" Working...\"\n " ,
132+ " for line in thought.splitlines():\n " ,
133+ " if line.strip().startswith(\" ACTION \" ):\n " ,
134+ " after = line.split(\" ACTION \" ,1)[1]\n " ,
135+ " action = after.split()[0].strip()\n " ,
136+ " if \" ARG \" in line:\n " ,
137+ " part = line.split(\" ARG \" ,1)[1]\n " ,
138+ " if \" THEN \" in part:\n " ,
139+ " arg = part.split(\" THEN \" )[0].strip()\n " ,
140+ " else:\n " ,
141+ " arg = part.strip()\n " ,
142+ " if \" THEN \" in line:\n " ,
143+ " assistant_msg = line.split(\" THEN \" ,1)[1].strip()\n " ,
144+ " output_events.append({\" summary\" :[{\" text\" :assistant_msg,\" type\" :\" summary_text\" }],\" type\" :\" reasoning\" })\n " ,
145+ " call_id = \" call_\" +uuid.uuid4().hex[:16]\n " ,
146+ " tool_res = self.tool.run(action, arg)\n " ,
147+ " output_events.append({\" action\" :{\" type\" :action,\" text\" :arg},\" call_id\" :call_id,\" status\" :tool_res[\" status\" ],\" type\" :\" computer_call\" })\n " ,
148+ " snap = self.tool.computer.screenshot()\n " ,
149+ " output_events.append({\" type\" :\" computer_call_output\" ,\" call_id\" :call_id,\" output\" :{\" type\" :\" input_image\" ,\" image_url\" :snap}})\n " ,
150+ " output_events.append({\" type\" :\" message\" ,\" role\" :\" assistant\" ,\" content\" :[{\" type\" :\" output_text\" ,\" text\" :assistant_msg}]})\n " ,
151+ " if \" done\" in assistant_msg.lower() or \" here is\" in assistant_msg.lower():\n " ,
152+ " break\n " ,
153+ " steps_remaining -= 1\n " ,
154+ " usage = {\" prompt_tokens\" : total_prompt_tokens,\" completion_tokens\" : total_completion_tokens,\" total_tokens\" : total_prompt_tokens + total_completion_tokens,\" response_cost\" : 0.0}\n " ,
155+ " yield {\" output\" : output_events, \" usage\" : usage}"
156+ ],
157+ "metadata" : {
158+ "id" : " TelJVoUdLRrX"
159+ },
160+ "execution_count" : null ,
161+ "outputs" : []
162+ },
163+ {
164+ "cell_type" : " code" ,
165+ "execution_count" : 3 ,
166+ "metadata" : {
167+ "colab" : {
168+ "base_uri" : " https://localhost:8080/"
169+ },
170+ "id" : " IV-CTHMk8S-U" ,
171+ "outputId" : " 8ae824e7-1e51-4731-e43e-4d118c2f4b92"
172+ },
173+ "outputs" : [
174+ {
175+ "output_type" : " stream" ,
176+ "name" : " stderr" ,
177+ "text" : [
178+ " Device set to use cpu\n "
179+ ]
180+ },
181+ {
182+ "output_type" : " stream" ,
183+ "name" : " stdout" ,
184+ "text" : [
185+ " ==== STREAM RESULT ====\n " ,
186+ " [TOOL CALL] screenshot -> [completed]\n " ,
187+ " SCREEN AFTER ACTION:\n " ,
188+ " FOCUS:browser\n " ,
189+ " SCREEN:\n " ,
190+ " Browser open at https://example.com\n " ,
191+ " Search bar focused.\n " ,
192+ " APPS:['browser', 'notes', 'mail'] ...\n " ,
193+ " \n " ,
194+ " ASSISTANT: Working... \n " ,
195+ " \n " ,
196+ " [TOOL CALL] screenshot -> [completed]\n " ,
197+ " SCREEN AFTER ACTION:\n " ,
198+ " FOCUS:browser\n " ,
199+ " SCREEN:\n " ,
200+ " Browser open at https://example.com\n " ,
201+ " Search bar focused.\n " ,
202+ " APPS:['browser', 'notes', 'mail'] ...\n " ,
203+ " \n " ,
204+ " ASSISTANT: Working... \n " ,
205+ " \n " ,
206+ " [TOOL CALL] screenshot -> [completed]\n " ,
207+ " SCREEN AFTER ACTION:\n " ,
208+ " FOCUS:browser\n " ,
209+ " SCREEN:\n " ,
210+ " Browser open at https://example.com\n " ,
211+ " Search bar focused.\n " ,
212+ " APPS:['browser', 'notes', 'mail'] ...\n " ,
213+ " \n " ,
214+ " ASSISTANT: Working... \n " ,
215+ " \n " ,
216+ " [TOOL CALL] screenshot -> [completed]\n " ,
217+ " SCREEN AFTER ACTION:\n " ,
218+ " FOCUS:browser\n " ,
219+ " SCREEN:\n " ,
220+ " Browser open at https://example.com\n " ,
221+ " Search bar focused.\n " ,
222+ " APPS:['browser', 'notes', 'mail'] ...\n " ,
223+ " \n " ,
224+ " ASSISTANT: Working... \n " ,
225+ " \n " ,
226+ " USAGE: {'prompt_tokens': 164, 'completion_tokens': 260, 'total_tokens': 424, 'response_cost': 0.0}\n "
227+ ]
228+ }
229+ ],
230+ "source" : [
231+ " async def main_demo():\n " ,
232+ " computer = VirtualComputer()\n " ,
233+ " tool = ComputerTool(computer)\n " ,
234+ " llm = LocalLLM()\n " ,
235+ " agent = ComputerAgent(llm, tool, max_trajectory_budget=4)\n " ,
236+ " messages=[{\" role\" :\" user\" ,\" content\" :\" Open mail, read inbox subjects, and summarize.\" }]\n " ,
237+ " async for result in agent.run(messages):\n " ,
238+ " print(\" ==== STREAM RESULT ====\" )\n " ,
239+ " for event in result[\" output\" ]:\n " ,
240+ " if event[\" type\" ]==\" computer_call\" :\n " ,
241+ " a = event.get(\" action\" ,{})\n " ,
242+ " print(f\" [TOOL CALL] {a.get('type')} -> {a.get('text')} [{event.get('status')}]\" )\n " ,
243+ " if event[\" type\" ]==\" computer_call_output\" :\n " ,
244+ " snap = event[\" output\" ][\" image_url\" ]\n " ,
245+ " print(\" SCREEN AFTER ACTION:\\ n\" , snap[:400],\" ...\\ n\" )\n " ,
246+ " if event[\" type\" ]==\" message\" :\n " ,
247+ " print(\" ASSISTANT:\" , event[\" content\" ][0][\" text\" ], \"\\ n\" )\n " ,
248+ " print(\" USAGE:\" , result[\" usage\" ])\n " ,
249+ " \n " ,
250+ " loop = asyncio.get_event_loop()\n " ,
251+ " loop.run_until_complete(main_demo())"
252+ ]
253+ }
254+ ]
255+ }
0 commit comments