1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " code" ,
19+ "source" : [
"# Install with %pip (not !pip) so packages land in the running kernel's environment.\n",
"%pip -q install faiss-cpu transformers==4.44.2 accelerate sentence-transformers==3.0.1\n",
"\n",
"from typing import List, Dict, Tuple\n",
"import re, textwrap, numpy as np, torch\n",
"from sentence_transformers import SentenceTransformer\n",
"import faiss\n",
"from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# Seq2seq generator used for answering, sentence-embedding model used for retrieval.\n",
"GEN_MODEL = \"google/flan-t5-base\"\n",
"EMB_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"\n",
"gen_tok = AutoTokenizer.from_pretrained(GEN_MODEL)\n",
"gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL, device_map=\"auto\")\n",
"generate = pipeline(\"text2text-generation\", model=gen_model, tokenizer=gen_tok)\n",
"\n",
"# Embed on GPU when available, else CPU.\n",
"emb_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"emb_model = SentenceTransformer(EMB_MODEL, device=emb_device)"
37+ ],
38+ "metadata" : {
39+ "id" : " FH7IWEP1easp"
40+ },
41+ "execution_count" : 3 ,
42+ "outputs" : []
43+ },
44+ {
45+ "cell_type" : " code" ,
46+ "source" : [
"# Tiny in-memory corpus of policy/runbook documents used for retrieval.\n",
"DOCS = [\n",
"    {\"id\":\"policy_sec_001\",\"title\":\"Data Security Policy\",\n",
"     \"text\":\"All customer data must be encrypted at rest (AES-256) and in transit (TLS 1.2+). Access is role-based (RBAC). Secrets are stored in a managed vault. Backups run nightly with 35-day retention. PII includes name, email, phone, address, PAN/Aadhaar.\"},\n",
"    {\"id\":\"policy_ai_002\",\"title\":\"Responsible AI Guidelines\",\n",
"     \"text\":\"Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days.\"},\n",
"    {\"id\":\"runbook_inc_003\",\"title\":\"Incident Response Runbook\",\n",
"     \"text\":\"If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only.\"},\n",
"    {\"id\":\"sop_sales_004\",\"title\":\"Sales SOP - Enterprise Deals\",\n",
"     \"text\":\"For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links.\"}\n",
"]\n",
"\n",
"def chunk(text: str, chunk_size: int = 600, overlap: int = 80) -> List[str]:\n",
"    \"\"\"Split `text` into word-window chunks of at most `chunk_size` words,\n",
"    repeating `overlap` words between consecutive chunks for context.\"\"\"\n",
"    if overlap >= chunk_size:\n",
"        # Guard: a step of chunk_size - overlap <= 0 would loop forever below.\n",
"        raise ValueError(\"overlap must be smaller than chunk_size\")\n",
"    w = text.split()\n",
"    if len(w) <= chunk_size:\n",
"        return [text]\n",
"    out = []\n",
"    i = 0\n",
"    while i < len(w):\n",
"        j = min(i + chunk_size, len(w))\n",
"        out.append(\" \".join(w[i:j]))\n",
"        if j == len(w):\n",
"            break\n",
"        i = j - overlap  # step back `overlap` words so chunks share context\n",
"    return out\n",
"\n",
"# Flatten documents into retrievable chunk records.\n",
"CORPUS = []\n",
"for d in DOCS:\n",
"    for i, c in enumerate(chunk(d[\"text\"])):\n",
"        CORPUS.append({\"doc_id\": d[\"id\"], \"title\": d[\"title\"], \"chunk_id\": i, \"text\": c})"
72+ ],
73+ "metadata" : {
74+ "id" : " HAP1DuGneant"
75+ },
76+ "execution_count" : 4 ,
77+ "outputs" : []
78+ },
79+ {
80+ "cell_type" : " code" ,
81+ "source" : [
"def build_index(chunks: List[Dict]) -> Tuple[faiss.IndexFlatIP, np.ndarray]:\n",
"    \"\"\"Embed every chunk and load the vectors into a flat inner-product FAISS index.\"\"\"\n",
"    texts = [entry[\"text\"] for entry in chunks]\n",
"    matrix = emb_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)\n",
"    ip_index = faiss.IndexFlatIP(matrix.shape[1])\n",
"    ip_index.add(matrix)\n",
"    return ip_index, matrix\n",
"\n",
"INDEX, VECS = build_index(CORPUS)\n",
"\n",
"# (compiled pattern, placeholder) pairs, applied in order by redact().\n",
"PII_PATTERNS = [\n",
"    (re.compile(r\"\\b\\d{10}\\b\"), \"<REDACTED_PHONE>\"),\n",
"    (re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b\", re.I), \"<REDACTED_EMAIL>\"),\n",
"    (re.compile(r\"\\b\\d{12}\\b\"), \"<REDACTED_ID12>\"),\n",
"    (re.compile(r\"\\b[A-Z]{5}\\d{4}[A-Z]\\b\"), \"<REDACTED_PAN>\")\n",
"]\n",
"\n",
"def redact(t: str) -> str:\n",
"    \"\"\"Replace every PII match in `t` with its placeholder token.\"\"\"\n",
"    cleaned = t\n",
"    for pattern, placeholder in PII_PATTERNS:\n",
"        cleaned = pattern.sub(placeholder, cleaned)\n",
"    return cleaned\n",
"\n",
"# Requests matching any of these patterns are refused outright.\n",
"POLICY_DISALLOWED = [\n",
"    re.compile(r\"\\b(share|exfiltrate)\\b.*\\b(raw|all)\\b.*\\bdata\\b\", re.I),\n",
"    re.compile(r\"\\bdisable\\b.*\\bencryption\\b\", re.I),\n",
"]\n",
"\n",
"def policy_check(q: str):\n",
"    \"\"\"Return (allowed, reason); `reason` is empty when the request is allowed.\"\"\"\n",
"    if any(rule.search(q) for rule in POLICY_DISALLOWED):\n",
"        return False, \"Request violates security policy (data exfiltration/encryption tampering).\"\n",
"    return True, \"\""
106+ ],
107+ "metadata" : {
108+ "id" : " rDGxC7I5eak0"
109+ },
110+ "execution_count" : 5 ,
111+ "outputs" : []
112+ },
113+ {
114+ "cell_type" : " code" ,
115+ "source" : [
"def retrieve(query: str, k: int = 4) -> List[Dict]:\n",
"    \"\"\"Return the top-k corpus chunks for `query`, each annotated with its score.\"\"\"\n",
"    query_vec = emb_model.encode([query], normalize_embeddings=True, convert_to_numpy=True)\n",
"    scores, idxs = INDEX.search(query_vec, k)\n",
"    results = []\n",
"    for score, idx in zip(scores[0], idxs[0]):\n",
"        results.append({**CORPUS[idx], \"score\": float(score)})\n",
"    return results\n",
"\n",
"SYSTEM = (\n",
"    \"You are an enterprise AI assistant.\\n\"\n",
"    \"- Answer strictly from the provided CONTEXT.\\n\"\n",
"    \"- If missing info, say what is unknown and suggest the correct policy/runbook.\\n\"\n",
"    \"- Keep it concise and cite titles + doc_ids inline like [Title (doc_id:chunk)].\"\n",
")\n",
"\n",
"def build_prompt(user_q: str, ctx_blocks: List[Dict]) -> str:\n",
"    \"\"\"Assemble system rules, numbered context blocks and the redacted question.\"\"\"\n",
"    numbered = []\n",
"    for pos, block in enumerate(ctx_blocks, start=1):\n",
"        numbered.append(f\"[{pos}] {block['title']} (doc:{block['doc_id']}:{block['chunk_id']})\\n{block['text']}\")\n",
"    ctx = \"\\n\\n\".join(numbered)\n",
"    uq = redact(user_q)  # never send raw PII to the model\n",
"    return f\"SYSTEM:\\n{SYSTEM}\\n\\nCONTEXT:\\n{ctx}\\n\\nUSER QUESTION:\\n{uq}\\n\\nINSTRUCTIONS:\\n- Cite sources inline.\\n- Keep to 5-8 sentences.\\n- Preserve redactions.\"\n",
"\n",
"def answer(user_q: str, k: int = 4, max_new_tokens: int = 220) -> Dict:\n",
"    \"\"\"Policy-gate the question, retrieve context, and generate a grounded answer.\"\"\"\n",
"    ok, msg = policy_check(user_q)\n",
"    if not ok:\n",
"        return {\"answer\": f\"❌ {msg}\", \"ctx\": []}\n",
"    ctx = retrieve(user_q, k=k)\n",
"    prompt = build_prompt(user_q, ctx)\n",
"    out = generate(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0][\"generated_text\"].strip()\n",
"    return {\"answer\": out, \"ctx\": ctx}"
136+ ],
137+ "metadata" : {
138+ "id" : " RaWPdi1jeafq"
139+ },
140+ "execution_count" : 6 ,
141+ "outputs" : []
142+ },
143+ {
144+ "cell_type" : " code" ,
145+ "execution_count" : 7 ,
146+ "metadata" : {
147+ "colab" : {
148+ "base_uri" : " https://localhost:8080/"
149+ },
150+ "id" : " CNm09M8HcqFy" ,
151+ "outputId" : " 85d33ac3-ffc9-4faa-d3aa-715609ec595b"
152+ },
153+ "outputs" : [
154+ {
155+ "output_type" : " stream" ,
156+ "name" : " stdout" ,
157+ "text" : [
158+ " \n " ,
159+ " ====================================================================================================\n " ,
160+ " Q: What encryption and backup rules do we follow for customer data?\n " ,
161+ " \n " ,
162+ " A: Cite sources inline.\n " ,
163+ " \n " ,
164+ " Retrieved Context (top 3):\n " ,
165+ " - Data Security Policy [policy_sec_001:0] score=0.583\n " ,
166+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.295\n " ,
167+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.267\n " ,
168+ " Eval: {'terms': 7, 'hits': 3, 'hit_rate': 0.43}\n " ,
169+ " \n " ,
170+ " ====================================================================================================\n " ,
171+ " Q: Can we auto-answer RFP security questionnaires? What should we cite?\n " ,
172+ " \n " ,
173+ " A: Cite sources inline.\n " ,
174+ " \n " ,
175+ " Retrieved Context (top 3):\n " ,
176+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.663\n " ,
177+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.243\n " ,
178+ " - Incident Response Runbook [runbook_inc_003:0] score=0.194\n " ,
179+ " Eval: {'terms': 7, 'hits': 1, 'hit_rate': 0.14}\n " ,
180+ " \n " ,
181+ " ====================================================================================================\n " ,
182+ " Q: If there is a suspected breach, what are the first three steps?\n " ,
183+ " \n " ,
184+ " A: Incident Response Runbook (doc:runbook_inc_003:0) If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only. [2] Responsible AI Guidelines (doc:policy_ai_002:0) Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days. [3] Sales SOP - Enterprise Deals (doc:sop_sales_004:0) For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links.\n " ,
185+ " \n " ,
186+ " Retrieved Context (top 3):\n " ,
187+ " - Incident Response Runbook [runbook_inc_003:0] score=0.549\n " ,
188+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.234\n " ,
189+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.226\n " ,
190+ " Eval: {'terms': 7, 'hits': 2, 'hit_rate': 0.29}\n " ,
191+ " \n " ,
192+ " ====================================================================================================\n " ,
193+ " Q: Is it allowed to share all raw customer data externally for testing?\n " ,
194+ " \n " ,
195+ " A: ❌ Request violates security policy (data exfiltration/encryption tampering).\n "
196+ ]
197+ }
198+ ],
199+ "source" : [
"def eval_query(user_q: str, ctx: List[Dict]) -> Dict:\n",
"    \"\"\"Crude lexical-overlap metric: share of question terms (4+ letters) that appear in the retrieved text.\"\"\"\n",
"    query_terms = [tok.lower() for tok in re.findall(r\"[a-zA-Z]{4,}\", user_q)]\n",
"    haystack = \" \".join(block[\"text\"].lower() for block in ctx)\n",
"    matched = sum(1 for term in query_terms if term in haystack)\n",
"    return {\"terms\": len(query_terms), \"hits\": matched, \"hit_rate\": round(matched / max(1, len(query_terms)), 2)}\n",
"\n",
"QUERIES = [\n",
"    \"What encryption and backup rules do we follow for customer data?\",\n",
"    \"Can we auto-answer RFP security questionnaires? What should we cite?\",\n",
"    \"If there is a suspected breach, what are the first three steps?\",\n",
"    \"Is it allowed to share all raw customer data externally for testing?\"\n",
"]\n",
"\n",
"for q in QUERIES:\n",
"    res = answer(q, k=3)\n",
"    print(\"\\n\" + \"=\" * 100)\n",
"    print(\"Q:\", q)\n",
"    print(\"\\nA:\", res[\"answer\"])\n",
"    if res[\"ctx\"]:\n",
"        ev = eval_query(q, res[\"ctx\"])\n",
"        print(\"\\nRetrieved Context (top 3):\")\n",
"        for r in res[\"ctx\"]:\n",
"            print(f\"- {r['title']} [{r['doc_id']}:{r['chunk_id']}] score={r['score']:.3f}\")\n",
"        print(\"Eval:\", ev)"
219+ ]
220+ }
221+ ]
222+ }
0 commit comments