Add files via upload

Marktechpost · web-flow · commit dc263fe89e82 · 2025-10-25T03:48:11.000-07:00
diff --git a/AI Agents Codes/Computer_Use_Agent_Local_AI_Marktechpost.ipynb b/AI Agents Codes/Computer_Use_Agent_Local_AI_Marktechpost.ipynb
@@ -0,0 +1,255 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install -q transformers accelerate sentencepiece nest_asyncio\n",
+        "import torch, asyncio, uuid\n",
+        "from transformers import pipeline\n",
+        "import nest_asyncio\n",
+        "nest_asyncio.apply()"
+      ],
+      "metadata": {
+        "id": "A8kEftfYLR-p"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class LocalLLM:\n",
+        "    \"\"\"Thin wrapper around a Hugging Face text2text-generation pipeline.\"\"\"\n",
+        "\n",
+        "    def __init__(self, model_name=\"google/flan-t5-small\", max_new_tokens=128):\n",
+        "        \"\"\"Load model_name on GPU 0 when CUDA is available, otherwise on CPU.\"\"\"\n",
+        "        device = 0 if torch.cuda.is_available() else -1\n",
+        "        self.pipe = pipeline(\"text2text-generation\", model=model_name, device=device)\n",
+        "        self.max_new_tokens = max_new_tokens\n",
+        "\n",
+        "    def generate(self, prompt: str) -> str:\n",
+        "        \"\"\"Return the completion for prompt with surrounding whitespace removed.\"\"\"\n",
+        "        # temperature only applies when sampling, and 0.0 is not a valid sampling\n",
+        "        # temperature; deterministic (greedy) decoding is requested via do_sample=False.\n",
+        "        outputs = self.pipe(prompt, max_new_tokens=self.max_new_tokens, do_sample=False)\n",
+        "        return outputs[0][\"generated_text\"].strip()\n",
+        "\n",
+        "class VirtualComputer:\n",
+        "    \"\"\"In-memory stand-in for a desktop with browser, notes and mail apps.\"\"\"\n",
+        "\n",
+        "    def __init__(self):\n",
+        "        \"\"\"Start with the browser focused and an empty action log.\"\"\"\n",
+        "        self.apps = {\"browser\": \"https://example.com\", \"notes\": \"\", \"mail\": [\"Welcome to CUA\", \"Invoice #221\", \"Weekly Report\"]}\n",
+        "        self.focus = \"browser\"\n",
+        "        self.screen = \"Browser open at https://example.com\\nSearch bar focused.\"\n",
+        "        self.action_log = []\n",
+        "\n",
+        "    def screenshot(self):\n",
+        "        \"\"\"Return a text snapshot: focused app, screen contents and app names.\"\"\"\n",
+        "        return f\"FOCUS:{self.focus}\\nSCREEN:\\n{self.screen}\\nAPPS:{list(self.apps.keys())}\"\n",
+        "\n",
+        "    def _notes_screen(self):\n",
+        "        \"\"\"Render the notes app view from the stored notes.\"\"\"\n",
+        "        return f\"Notes App\\nCurrent notes:\\n{self.apps['notes']}\"\n",
+        "\n",
+        "    def click(self, target:str):\n",
+        "        \"\"\"Focus the app named target; other targets are only noted on the screen.\"\"\"\n",
+        "        if target in self.apps:\n",
+        "            self.focus = target\n",
+        "            if target==\"browser\":\n",
+        "                self.screen = f\"Browser tab: {self.apps['browser']}\\nAddress bar focused.\"\n",
+        "            elif target==\"notes\":\n",
+        "                self.screen = self._notes_screen()\n",
+        "            elif target==\"mail\":\n",
+        "                inbox = \"\\n\".join(f\"- {s}\" for s in self.apps['mail'])\n",
+        "                self.screen = f\"Mail App Inbox:\\n{inbox}\\n(Read-only preview)\"\n",
+        "        else:\n",
+        "            self.screen += f\"\\nClicked '{target}'.\"\n",
+        "        # Every click is logged, whether or not it hit an app.\n",
+        "        self.action_log.append({\"type\":\"click\",\"target\":target})\n",
+        "\n",
+        "    def type(self, text:str):\n",
+        "        \"\"\"Type text into the focused app.\n",
+        "\n",
+        "        The browser navigates to text, notes gain text as a new line, and any\n",
+        "        other app only records that nothing editable was focused.\n",
+        "        \"\"\"\n",
+        "        if self.focus==\"browser\":\n",
+        "            self.apps[\"browser\"] = text\n",
+        "            self.screen = f\"Browser tab now at {text}\\nPage headline: Example Domain\"\n",
+        "        elif self.focus==\"notes\":\n",
+        "            # Separate notes with a newline, but do not start the very first note with one.\n",
+        "            existing = self.apps[\"notes\"]\n",
+        "            self.apps[\"notes\"] = f\"{existing}\\n{text}\" if existing else text\n",
+        "            self.screen = self._notes_screen()\n",
+        "        else:\n",
+        "            self.screen += f\"\\nTyped '{text}' but no editable field.\"\n",
+        "        self.action_log.append({\"type\":\"type\",\"text\":text})"
+      ],
+      "metadata": {
+        "id": "udvEjvRiLRyj"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class ComputerTool:\n",
+        "    \"\"\"Adapter that maps string commands onto a VirtualComputer.\"\"\"\n",
+        "\n",
+        "    def __init__(self, computer:VirtualComputer):\n",
+        "        \"\"\"Remember the computer every command is forwarded to.\"\"\"\n",
+        "        self.computer = computer\n",
+        "\n",
+        "    def run(self, command:str, argument:str=\"\"):\n",
+        "        \"\"\"Run command and return a dict with status and result keys.\n",
+        "\n",
+        "        click and type forward argument to the computer, screenshot returns the\n",
+        "        current snapshot, and any other command gives an error status.\n",
+        "        \"\"\"\n",
+        "        if command==\"screenshot\":\n",
+        "            outcome = self.computer.screenshot()\n",
+        "        elif command==\"click\":\n",
+        "            self.computer.click(argument)\n",
+        "            outcome = f\"clicked {argument}\"\n",
+        "        elif command==\"type\":\n",
+        "            self.computer.type(argument)\n",
+        "            outcome = f\"typed {argument}\"\n",
+        "        else:\n",
+        "            return {\"status\":\"error\",\"result\":f\"unknown command {command}\"}\n",
+        "        return {\"status\":\"completed\",\"result\":outcome}"
+      ],
+      "metadata": {
+        "id": "7jNxJAytLRuq"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class ComputerAgent:\n",
+        "    \"\"\"Minimal computer-use loop: show the screen to the LLM, parse its reply, run one tool call.\"\"\"\n",
+        "\n",
+        "    def __init__(self, llm:LocalLLM, tool:ComputerTool, max_trajectory_budget:float=5.0):\n",
+        "        \"\"\"Store the collaborators; max_trajectory_budget is truncated to a whole number of steps.\"\"\"\n",
+        "        self.llm = llm\n",
+        "        self.tool = tool\n",
+        "        self.max_trajectory_budget = max_trajectory_budget\n",
+        "\n",
+        "    @staticmethod\n",
+        "    def _parse_reply(reply:str):\n",
+        "        \"\"\"Extract (action, arg, assistant_msg) from an 'ACTION .. ARG .. THEN ..' reply.\n",
+        "\n",
+        "        Falls back to a screenshot with the message 'Working...' when the reply\n",
+        "        does not follow the requested format.\n",
+        "        \"\"\"\n",
+        "        action, arg, assistant_msg = \"screenshot\", \"\", \"Working...\"\n",
+        "        for line in reply.splitlines():\n",
+        "            if line.strip().startswith(\"ACTION \"):\n",
+        "                token = line.split(\"ACTION \",1)[1].split()[0]\n",
+        "                # The prompt shows the verb as <click/type/screenshot>, so a model may echo the\n",
+        "                # brackets or capitalise the verb; normalise it before it reaches the tool.\n",
+        "                cleaned = token.strip(\"<>.,:;\").lower()\n",
+        "                if cleaned:\n",
+        "                    action = cleaned\n",
+        "            if \"ARG \" in line:\n",
+        "                # Everything after ARG up to an optional ' THEN ' separator is the argument.\n",
+        "                arg = line.split(\"ARG \",1)[1].split(\" THEN \")[0].strip()\n",
+        "            if \"THEN \" in line:\n",
+        "                assistant_msg = line.split(\"THEN \",1)[1].strip()\n",
+        "        return action, arg, assistant_msg\n",
+        "\n",
+        "    async def run(self, messages):\n",
+        "        \"\"\"Work toward the goal in the last message and yield a single result dict.\n",
+        "\n",
+        "        The yielded dict has an output list (reasoning, computer_call,\n",
+        "        computer_call_output and message events for every step) and a usage dict.\n",
+        "        The loop stops when the step budget is spent or the assistant message\n",
+        "        contains 'done' or 'here is'.\n",
+        "        \"\"\"\n",
+        "        user_goal = messages[-1][\"content\"]\n",
+        "        output_events = []\n",
+        "        total_prompt_tokens = 0\n",
+        "        total_completion_tokens = 0\n",
+        "        for _ in range(int(self.max_trajectory_budget)):\n",
+        "            screen = self.tool.computer.screenshot()\n",
+        "            prompt = (\n",
+        "                \"You are a computer-use agent.\\n\"\n",
+        "                f\"User goal: {user_goal}\\n\"\n",
+        "                f\"Current screen:\\n{screen}\\n\\n\"\n",
+        "                \"Think step-by-step.\\n\"\n",
+        "                \"Reply with: ACTION <click/type/screenshot> ARG <target or text> THEN <assistant message>.\\n\"\n",
+        "            )\n",
+        "            thought = self.llm.generate(prompt)\n",
+        "            # Whitespace-separated word counts serve as a cheap stand-in for token counts.\n",
+        "            total_prompt_tokens += len(prompt.split())\n",
+        "            total_completion_tokens += len(thought.split())\n",
+        "            action, arg, assistant_msg = self._parse_reply(thought)\n",
+        "            output_events.append({\"summary\":[{\"text\":assistant_msg,\"type\":\"summary_text\"}],\"type\":\"reasoning\"})\n",
+        "            call_id = \"call_\"+uuid.uuid4().hex[:16]\n",
+        "            tool_res = self.tool.run(action, arg)\n",
+        "            output_events.append({\"action\":{\"type\":action,\"text\":arg},\"call_id\":call_id,\"status\":tool_res[\"status\"],\"type\":\"computer_call\"})\n",
+        "            # The post-action snapshot is reported under the same call_id as the call itself.\n",
+        "            snap = self.tool.computer.screenshot()\n",
+        "            output_events.append({\"type\":\"computer_call_output\",\"call_id\":call_id,\"output\":{\"type\":\"input_image\",\"image_url\":snap}})\n",
+        "            output_events.append({\"type\":\"message\",\"role\":\"assistant\",\"content\":[{\"type\":\"output_text\",\"text\":assistant_msg}]})\n",
+        "            lowered = assistant_msg.lower()\n",
+        "            if \"done\" in lowered or \"here is\" in lowered:\n",
+        "                break\n",
+        "        usage = {\"prompt_tokens\": total_prompt_tokens,\"completion_tokens\": total_completion_tokens,\"total_tokens\": total_prompt_tokens + total_completion_tokens,\"response_cost\": 0.0}\n",
+        "        yield {\"output\": output_events, \"usage\": usage}"
+      ],
+      "metadata": {
+        "id": "TelJVoUdLRrX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "IV-CTHMk8S-U",
+        "outputId": "8ae824e7-1e51-4731-e43e-4d118c2f4b92"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Device set to use cpu\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "==== STREAM RESULT ====\n",
+            "[TOOL CALL] screenshot ->  [completed]\n",
+            "SCREEN AFTER ACTION:\n",
+            " FOCUS:browser\n",
+            "SCREEN:\n",
+            "Browser open at https://example.com\n",
+            "Search bar focused.\n",
+            "APPS:['browser', 'notes', 'mail'] ...\n",
+            "\n",
+            "ASSISTANT: Working... \n",
+            "\n",
+            "[TOOL CALL] screenshot ->  [completed]\n",
+            "SCREEN AFTER ACTION:\n",
+            " FOCUS:browser\n",
+            "SCREEN:\n",
+            "Browser open at https://example.com\n",
+            "Search bar focused.\n",
+            "APPS:['browser', 'notes', 'mail'] ...\n",
+            "\n",
+            "ASSISTANT: Working... \n",
+            "\n",
+            "[TOOL CALL] screenshot ->  [completed]\n",
+            "SCREEN AFTER ACTION:\n",
+            " FOCUS:browser\n",
+            "SCREEN:\n",
+            "Browser open at https://example.com\n",
+            "Search bar focused.\n",
+            "APPS:['browser', 'notes', 'mail'] ...\n",
+            "\n",
+            "ASSISTANT: Working... \n",
+            "\n",
+            "[TOOL CALL] screenshot ->  [completed]\n",
+            "SCREEN AFTER ACTION:\n",
+            " FOCUS:browser\n",
+            "SCREEN:\n",
+            "Browser open at https://example.com\n",
+            "Search bar focused.\n",
+            "APPS:['browser', 'notes', 'mail'] ...\n",
+            "\n",
+            "ASSISTANT: Working... \n",
+            "\n",
+            "USAGE: {'prompt_tokens': 164, 'completion_tokens': 260, 'total_tokens': 424, 'response_cost': 0.0}\n"
+          ]
+        }
+      ],
+      "source": [
+        "async def main_demo():\n",
+        "    \"\"\"Run the agent on a sample mail-summary goal and print every event it reports.\"\"\"\n",
+        "    desktop = VirtualComputer()\n",
+        "    agent = ComputerAgent(LocalLLM(), ComputerTool(desktop), max_trajectory_budget=4)\n",
+        "    conversation = [{\"role\":\"user\",\"content\":\"Open mail, read inbox subjects, and summarize.\"}]\n",
+        "    async for chunk in agent.run(conversation):\n",
+        "        print(\"==== STREAM RESULT ====\")\n",
+        "        for ev in chunk[\"output\"]:\n",
+        "            kind = ev[\"type\"]\n",
+        "            if kind==\"computer_call\":\n",
+        "                call = ev.get(\"action\",{})\n",
+        "                print(f\"[TOOL CALL] {call.get('type')} -> {call.get('text')} [{ev.get('status')}]\")\n",
+        "            elif kind==\"computer_call_output\":\n",
+        "                # Only the first 400 characters of the snapshot are shown.\n",
+        "                print(\"SCREEN AFTER ACTION:\\n\", ev[\"output\"][\"image_url\"][:400],\"...\\n\")\n",
+        "            elif kind==\"message\":\n",
+        "                print(\"ASSISTANT:\", ev[\"content\"][0][\"text\"], \"\\n\")\n",
+        "        print(\"USAGE:\", chunk[\"usage\"])\n",
+        "\n",
+        "# asyncio.get_event_loop() is deprecated when no loop is running; asyncio.run works in a\n",
+        "# plain script and, because the first cell calls nest_asyncio.apply(), inside the notebook too.\n",
+        "asyncio.run(main_demo())"
+      ]
+    }
+  ]
+}