Skip to content

Commit 08eb217

Browse files
authored
Add files via upload
1 parent 4e118bf commit 08eb217

1 file changed

Lines changed: 222 additions & 0 deletions

File tree

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"source": [
20+
"!pip -q install faiss-cpu transformers==4.44.2 accelerate sentence-transformers==3.0.1\n",
21+
"\n",
22+
"from typing import List, Dict, Tuple\n",
23+
"import re, textwrap, numpy as np, torch\n",
24+
"from sentence_transformers import SentenceTransformer\n",
25+
"import faiss\n",
26+
"from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM\n",
27+
"\n",
28+
"GEN_MODEL = \"google/flan-t5-base\"\n",
29+
"EMB_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
30+
"\n",
31+
"gen_tok = AutoTokenizer.from_pretrained(GEN_MODEL)\n",
32+
"gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL, device_map=\"auto\")\n",
33+
"generate = pipeline(\"text2text-generation\", model=gen_model, tokenizer=gen_tok)\n",
34+
"\n",
35+
"emb_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
36+
"emb_model = SentenceTransformer(EMB_MODEL, device=emb_device)"
37+
],
38+
"metadata": {
39+
"id": "FH7IWEP1easp"
40+
},
41+
"execution_count": 3,
42+
"outputs": []
43+
},
44+
{
45+
"cell_type": "code",
46+
"source": [
47+
# Tiny in-memory corpus of internal policy/runbook documents used for RAG.
DOCS = [
    {"id":"policy_sec_001","title":"Data Security Policy",
     "text":"All customer data must be encrypted at rest (AES-256) and in transit (TLS 1.2+). Access is role-based (RBAC). Secrets are stored in a managed vault. Backups run nightly with 35-day retention. PII includes name, email, phone, address, PAN/Aadhaar."},
    {"id":"policy_ai_002","title":"Responsible AI Guidelines",
     "text":"Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days."},
    {"id":"runbook_inc_003","title":"Incident Response Runbook",
     "text":"If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only."},
    {"id":"sop_sales_004","title":"Sales SOP - Enterprise Deals",
     "text":"For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links."}
]

def chunk(text: str, chunk_size: int = 600, overlap: int = 80) -> List[str]:
    """Split `text` into word-window chunks of at most `chunk_size` words,
    with up to `overlap` words shared between consecutive chunks.

    Fix: `overlap` is clamped to chunk_size - 1. Previously, calling with
    overlap >= chunk_size made the window start move backwards (i = j - overlap
    <= previous i), producing an infinite loop.

    Raises:
        ValueError: if chunk_size < 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")
    words = text.split()
    if len(words) <= chunk_size:
        return [text]
    overlap = min(overlap, chunk_size - 1)  # guarantee forward progress
    out = []
    i = 0
    while i < len(words):
        j = min(i + chunk_size, len(words))
        out.append(" ".join(words[i:j]))
        if j == len(words):
            break
        i = j - overlap
    return out

# Flatten documents into retrievable chunks, keeping provenance metadata.
CORPUS: List[Dict] = []
for d in DOCS:
    for i, c in enumerate(chunk(d["text"])):
        CORPUS.append({"doc_id": d["id"], "title": d["title"], "chunk_id": i, "text": c})
72+
],
73+
"metadata": {
74+
"id": "HAP1DuGneant"
75+
},
76+
"execution_count": 4,
77+
"outputs": []
78+
},
79+
{
80+
"cell_type": "code",
81+
"source": [
82+
def build_index(chunks: List[Dict]) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Embed every chunk and load the vectors into a FAISS inner-product index.

    Embeddings are L2-normalized, so inner product equals cosine similarity.
    Returns the index together with the raw embedding matrix.
    """
    texts = [entry["text"] for entry in chunks]
    embeddings = emb_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
    ip_index = faiss.IndexFlatIP(embeddings.shape[1])
    ip_index.add(embeddings)
    return ip_index, embeddings

INDEX, VECS = build_index(CORPUS)
87+
"\n",
88+
# (compiled pattern, replacement token) pairs, applied in order. Word
# boundaries (\b) keep the 10-digit phone rule from matching inside a
# 12-digit ID, since digit-to-digit positions are not boundaries.
PII_PATTERNS = [
    (re.compile(r"\b\d{10}\b"), "<REDACTED_PHONE>"),
    (re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.I), "<REDACTED_EMAIL>"),
    (re.compile(r"\b\d{12}\b"), "<REDACTED_ID12>"),
    (re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b"), "<REDACTED_PAN>")
]

def redact(t: str) -> str:
    """Return `t` with phone / email / 12-digit ID / PAN substrings replaced
    by their redaction tokens."""
    scrubbed = t
    for pattern, token in PII_PATTERNS:
        scrubbed = pattern.sub(token, scrubbed)
    return scrubbed
97+
"\n",
98+
# Hard-deny request patterns: bulk data exfiltration and encryption tampering.
POLICY_DISALLOWED = [
    re.compile(r"\b(share|exfiltrate)\b.*\b(raw|all)\b.*\bdata\b", re.I),
    re.compile(r"\bdisable\b.*\bencryption\b", re.I),
]

def policy_check(q: str):
    """Screen a user query against the deny-list.

    Returns (True, "") when the query is allowed, otherwise (False, reason).
    """
    if any(rule.search(q) for rule in POLICY_DISALLOWED):
        return False, "Request violates security policy (data exfiltration/encryption tampering)."
    return True, ""
106+
],
107+
"metadata": {
108+
"id": "rDGxC7I5eak0"
109+
},
110+
"execution_count": 5,
111+
"outputs": []
112+
},
113+
{
114+
"cell_type": "code",
115+
"source": [
116+
def retrieve(query: str, k: int = 4) -> List[Dict]:
    """Return the top-k corpus chunks for `query`, each annotated with its
    similarity score (inner product of unit-norm embeddings)."""
    query_vec = emb_model.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    scores, indices = INDEX.search(query_vec, k)
    results = []
    for score, idx in zip(scores[0], indices[0]):
        hit = dict(CORPUS[idx])
        hit["score"] = float(score)
        results.append(hit)
    return results
120+
"\n",
121+
# System contract for the generator: grounded, cited, concise answers.
SYSTEM = "\n".join([
    "You are an enterprise AI assistant.",
    "- Answer strictly from the provided CONTEXT.",
    "- If missing info, say what is unknown and suggest the correct policy/runbook.",
    "- Keep it concise and cite titles + doc_ids inline like [Title (doc_id:chunk)].",
])

def build_prompt(user_q: str, ctx_blocks: List[Dict]) -> str:
    """Assemble the grounded prompt: system rules, numbered context blocks,
    the PII-redacted user question, and the output instructions."""
    numbered = []
    for pos, block in enumerate(ctx_blocks, start=1):
        header = f"[{pos}] {block['title']} (doc:{block['doc_id']}:{block['chunk_id']})"
        numbered.append(header + "\n" + block["text"])
    ctx = "\n\n".join(numbered)
    safe_q = redact(user_q)  # strip PII before it reaches the model
    return (
        f"SYSTEM:\n{SYSTEM}\n\nCONTEXT:\n{ctx}\n\nUSER QUESTION:\n{safe_q}\n\n"
        "INSTRUCTIONS:\n- Cite sources inline.\n- Keep to 5-8 sentences.\n- Preserve redactions."
    )
129+
"\n",
130+
def answer(user_q: str, k: int = 4, max_new_tokens: int = 220) -> Dict:
    """Policy-gate the question, retrieve context, and generate a grounded answer.

    Returns {"answer": str, "ctx": retrieved chunks}; `ctx` is empty when the
    policy gate rejects the request.
    """
    allowed, reason = policy_check(user_q)
    if not allowed:
        return {"answer": f"❌ {reason}", "ctx": []}
    context = retrieve(user_q, k=k)
    prompt = build_prompt(user_q, context)
    generation = generate(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    text = generation[0]["generated_text"].strip()
    return {"answer": text, "ctx": context}
136+
],
137+
"metadata": {
138+
"id": "RaWPdi1jeafq"
139+
},
140+
"execution_count": 6,
141+
"outputs": []
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": 7,
146+
"metadata": {
147+
"colab": {
148+
"base_uri": "https://localhost:8080/"
149+
},
150+
"id": "CNm09M8HcqFy",
151+
"outputId": "85d33ac3-ffc9-4faa-d3aa-715609ec595b"
152+
},
153+
"outputs": [
154+
{
155+
"output_type": "stream",
156+
"name": "stdout",
157+
"text": [
158+
"\n",
159+
"====================================================================================================\n",
160+
"Q: What encryption and backup rules do we follow for customer data?\n",
161+
"\n",
162+
"A: Cite sources inline.\n",
163+
"\n",
164+
"Retrieved Context (top 3):\n",
165+
"- Data Security Policy [policy_sec_001:0] score=0.583\n",
166+
"- Responsible AI Guidelines [policy_ai_002:0] score=0.295\n",
167+
"- Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.267\n",
168+
"Eval: {'terms': 7, 'hits': 3, 'hit_rate': 0.43}\n",
169+
"\n",
170+
"====================================================================================================\n",
171+
"Q: Can we auto-answer RFP security questionnaires? What should we cite?\n",
172+
"\n",
173+
"A: Cite sources inline.\n",
174+
"\n",
175+
"Retrieved Context (top 3):\n",
176+
"- Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.663\n",
177+
"- Responsible AI Guidelines [policy_ai_002:0] score=0.243\n",
178+
"- Incident Response Runbook [runbook_inc_003:0] score=0.194\n",
179+
"Eval: {'terms': 7, 'hits': 1, 'hit_rate': 0.14}\n",
180+
"\n",
181+
"====================================================================================================\n",
182+
"Q: If there is a suspected breach, what are the first three steps?\n",
183+
"\n",
184+
"A: Incident Response Runbook (doc:runbook_inc_003:0) If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only. [2] Responsible AI Guidelines (doc:policy_ai_002:0) Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days. [3] Sales SOP - Enterprise Deals (doc:sop_sales_004:0) For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links.\n",
185+
"\n",
186+
"Retrieved Context (top 3):\n",
187+
"- Incident Response Runbook [runbook_inc_003:0] score=0.549\n",
188+
"- Responsible AI Guidelines [policy_ai_002:0] score=0.234\n",
189+
"- Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.226\n",
190+
"Eval: {'terms': 7, 'hits': 2, 'hit_rate': 0.29}\n",
191+
"\n",
192+
"====================================================================================================\n",
193+
"Q: Is it allowed to share all raw customer data externally for testing?\n",
194+
"\n",
195+
"A: ❌ Request violates security policy (data exfiltration/encryption tampering).\n"
196+
]
197+
}
198+
],
199+
"source": [
200+
def eval_query(user_q: str, ctx: List[Dict]) -> Dict:
    """Crude retrieval diagnostic: fraction of the question's words (4+ letters)
    that appear as substrings of the retrieved context text."""
    query_terms = [word.lower() for word in re.findall(r"[a-zA-Z]{4,}", user_q)]
    context_blob = " ".join(block["text"].lower() for block in ctx)
    matched = sum(1 for term in query_terms if term in context_blob)
    denom = max(1, len(query_terms))  # avoid division by zero on term-free queries
    return {"terms": len(query_terms), "hits": matched, "hit_rate": round(matched / denom, 2)}
205+
"\n",
206+
# Smoke-test queries: three in-policy questions plus one that must be blocked
# by the policy gate.
QUERIES = [
    "What encryption and backup rules do we follow for customer data?",
    "Can we auto-answer RFP security questionnaires? What should we cite?",
    "If there is a suspected breach, what are the first three steps?",
    "Is it allowed to share all raw customer data externally for testing?"
]

for question in QUERIES:
    result = answer(question, k=3)
    print("\n" + "=" * 100)
    print("Q:", question)
    print("\nA:", result["answer"])
    if result["ctx"]:
        evaluation = eval_query(question, result["ctx"])
        print("\nRetrieved Context (top 3):")
        for hit in result["ctx"]:
            print(f"- {hit['title']} [{hit['doc_id']}:{hit['chunk_id']}] score={hit['score']:.3f}")
        print("Eval:", evaluation)
219+
]
220+
}
221+
]
222+
}

0 commit comments

Comments
 (0)