1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " code" ,
19+ "source" : [
"# Install with %pip (not !pip) so packages land in the running kernel's environment.\n",
"%pip -q install faiss-cpu transformers==4.44.2 accelerate sentence-transformers==3.0.1\n",
"\n",
"from typing import List, Dict, Tuple\n",
"import re, textwrap, numpy as np, torch\n",
"from sentence_transformers import SentenceTransformer\n",
"import faiss\n",
"from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# Seq2seq generator used for answering, sentence-embedding model used for retrieval.\n",
"GEN_MODEL = \"google/flan-t5-base\"\n",
"EMB_MODEL = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"\n",
"gen_tok = AutoTokenizer.from_pretrained(GEN_MODEL)\n",
"gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL, device_map=\"auto\")\n",
"generate = pipeline(\"text2text-generation\", model=gen_model, tokenizer=gen_tok)\n",
"\n",
"# Embed on GPU when available, else CPU.\n",
"emb_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"emb_model = SentenceTransformer(EMB_MODEL, device=emb_device)"
37+ ],
38+ "metadata" : {
39+ "id" : " FH7IWEP1easp"
40+ },
41+ "execution_count" : 3 ,
42+ "outputs" : []
43+ },
44+ {
45+ "cell_type" : " code" ,
46+ "source" : [
"# Tiny in-memory corpus of policy/runbook documents used for retrieval.\n",
"DOCS = [\n",
"    {\"id\":\"policy_sec_001\",\"title\":\"Data Security Policy\",\n",
"     \"text\":\"All customer data must be encrypted at rest (AES-256) and in transit (TLS 1.2+). Access is role-based (RBAC). Secrets are stored in a managed vault. Backups run nightly with 35-day retention. PII includes name, email, phone, address, PAN/Aadhaar.\"},\n",
"    {\"id\":\"policy_ai_002\",\"title\":\"Responsible AI Guidelines\",\n",
"     \"text\":\"Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days.\"},\n",
"    {\"id\":\"runbook_inc_003\",\"title\":\"Incident Response Runbook\",\n",
"     \"text\":\"If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only.\"},\n",
"    {\"id\":\"sop_sales_004\",\"title\":\"Sales SOP - Enterprise Deals\",\n",
"     \"text\":\"For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links.\"}\n",
"]\n",
"\n",
"def chunk(text: str, chunk_size: int = 600, overlap: int = 80) -> List[str]:\n",
"    \"\"\"Split `text` into word-window chunks of at most `chunk_size` words,\n",
"    repeating `overlap` words between consecutive chunks for context.\"\"\"\n",
"    if overlap >= chunk_size:\n",
"        # Guard: a step of chunk_size - overlap <= 0 would loop forever below.\n",
"        raise ValueError(\"overlap must be smaller than chunk_size\")\n",
"    w = text.split()\n",
"    if len(w) <= chunk_size:\n",
"        return [text]\n",
"    out = []\n",
"    i = 0\n",
"    while i < len(w):\n",
"        j = min(i + chunk_size, len(w))\n",
"        out.append(\" \".join(w[i:j]))\n",
"        if j == len(w):\n",
"            break\n",
"        i = j - overlap  # step back `overlap` words so chunks share context\n",
"    return out\n",
"\n",
"# Flatten documents into retrievable chunk records.\n",
"CORPUS = []\n",
"for d in DOCS:\n",
"    for i, c in enumerate(chunk(d[\"text\"])):\n",
"        CORPUS.append({\"doc_id\": d[\"id\"], \"title\": d[\"title\"], \"chunk_id\": i, \"text\": c})"
72+ ],
73+ "metadata" : {
74+ "id" : " HAP1DuGneant"
75+ },
76+ "execution_count" : 4 ,
77+ "outputs" : []
78+ },
79+ {
80+ "cell_type" : " code" ,
81+ "source" : [
"def build_index(chunks: List[Dict]) -> Tuple[faiss.IndexFlatIP, np.ndarray]:\n",
"    \"\"\"Embed every chunk and load the vectors into a flat inner-product FAISS index.\"\"\"\n",
"    texts = [entry[\"text\"] for entry in chunks]\n",
"    matrix = emb_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)\n",
"    ip_index = faiss.IndexFlatIP(matrix.shape[1])\n",
"    ip_index.add(matrix)\n",
"    return ip_index, matrix\n",
"\n",
"INDEX, VECS = build_index(CORPUS)\n",
"\n",
"# (compiled pattern, placeholder) pairs, applied in order by redact().\n",
"PII_PATTERNS = [\n",
"    (re.compile(r\"\\b\\d{10}\\b\"), \"<REDACTED_PHONE>\"),\n",
"    (re.compile(r\"\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b\", re.I), \"<REDACTED_EMAIL>\"),\n",
"    (re.compile(r\"\\b\\d{12}\\b\"), \"<REDACTED_ID12>\"),\n",
"    (re.compile(r\"\\b[A-Z]{5}\\d{4}[A-Z]\\b\"), \"<REDACTED_PAN>\")\n",
"]\n",
"\n",
"def redact(t: str) -> str:\n",
"    \"\"\"Replace every PII match in `t` with its placeholder token.\"\"\"\n",
"    cleaned = t\n",
"    for pattern, placeholder in PII_PATTERNS:\n",
"        cleaned = pattern.sub(placeholder, cleaned)\n",
"    return cleaned\n",
"\n",
"# Requests matching any of these patterns are refused outright.\n",
"POLICY_DISALLOWED = [\n",
"    re.compile(r\"\\b(share|exfiltrate)\\b.*\\b(raw|all)\\b.*\\bdata\\b\", re.I),\n",
"    re.compile(r\"\\bdisable\\b.*\\bencryption\\b\", re.I),\n",
"]\n",
"\n",
"def policy_check(q: str):\n",
"    \"\"\"Return (allowed, reason); `reason` is empty when the request is allowed.\"\"\"\n",
"    if any(rule.search(q) for rule in POLICY_DISALLOWED):\n",
"        return False, \"Request violates security policy (data exfiltration/encryption tampering).\"\n",
"    return True, \"\""
106+ ],
107+ "metadata" : {
108+ "id" : " rDGxC7I5eak0"
109+ },
110+ "execution_count" : 5 ,
111+ "outputs" : []
112+ },
113+ {
114+ "cell_type" : " code" ,
115+ "source" : [
"def retrieve(query: str, k: int = 4) -> List[Dict]:\n",
"    \"\"\"Return the top-k corpus chunks for `query`, each annotated with its score.\"\"\"\n",
"    query_vec = emb_model.encode([query], normalize_embeddings=True, convert_to_numpy=True)\n",
"    scores, idxs = INDEX.search(query_vec, k)\n",
"    results = []\n",
"    for score, idx in zip(scores[0], idxs[0]):\n",
"        results.append({**CORPUS[idx], \"score\": float(score)})\n",
"    return results\n",
"\n",
"SYSTEM = (\n",
"    \"You are an enterprise AI assistant.\\n\"\n",
"    \"- Answer strictly from the provided CONTEXT.\\n\"\n",
"    \"- If missing info, say what is unknown and suggest the correct policy/runbook.\\n\"\n",
"    \"- Keep it concise and cite titles + doc_ids inline like [Title (doc_id:chunk)].\"\n",
")\n",
"\n",
"def build_prompt(user_q: str, ctx_blocks: List[Dict]) -> str:\n",
"    \"\"\"Assemble system rules, numbered context blocks and the redacted question.\"\"\"\n",
"    numbered = []\n",
"    for pos, block in enumerate(ctx_blocks, start=1):\n",
"        numbered.append(f\"[{pos}] {block['title']} (doc:{block['doc_id']}:{block['chunk_id']})\\n{block['text']}\")\n",
"    ctx = \"\\n\\n\".join(numbered)\n",
"    uq = redact(user_q)  # never send raw PII to the model\n",
"    return f\"SYSTEM:\\n{SYSTEM}\\n\\nCONTEXT:\\n{ctx}\\n\\nUSER QUESTION:\\n{uq}\\n\\nINSTRUCTIONS:\\n- Cite sources inline.\\n- Keep to 5-8 sentences.\\n- Preserve redactions.\"\n",
"\n",
"def answer(user_q: str, k: int = 4, max_new_tokens: int = 220) -> Dict:\n",
"    \"\"\"Policy-gate the question, retrieve context, and generate a grounded answer.\"\"\"\n",
"    ok, msg = policy_check(user_q)\n",
"    if not ok:\n",
"        return {\"answer\": f\"❌ {msg}\", \"ctx\": []}\n",
"    ctx = retrieve(user_q, k=k)\n",
"    prompt = build_prompt(user_q, ctx)\n",
"    out = generate(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0][\"generated_text\"].strip()\n",
"    return {\"answer\": out, \"ctx\": ctx}"
136+ ],
137+ "metadata" : {
138+ "id" : " RaWPdi1jeafq"
139+ },
140+ "execution_count" : 6 ,
141+ "outputs" : []
142+ },
143+ {
144+ "cell_type" : " code" ,
145+ "execution_count" : 7 ,
146+ "metadata" : {
147+ "colab" : {
148+ "base_uri" : " https://localhost:8080/"
149+ },
150+ "id" : " CNm09M8HcqFy" ,
151+ "outputId" : " 85d33ac3-ffc9-4faa-d3aa-715609ec595b"
152+ },
153+ "outputs" : [
154+ {
155+ "output_type" : " stream" ,
156+ "name" : " stdout" ,
157+ "text" : [
158+ " \n " ,
159+ " ====================================================================================================\n " ,
160+ " Q: What encryption and backup rules do we follow for customer data?\n " ,
161+ " \n " ,
162+ " A: Cite sources inline.\n " ,
163+ " \n " ,
164+ " Retrieved Context (top 3):\n " ,
165+ " - Data Security Policy [policy_sec_001:0] score=0.583\n " ,
166+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.295\n " ,
167+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.267\n " ,
168+ " Eval: {'terms': 7, 'hits': 3, 'hit_rate': 0.43}\n " ,
169+ " \n " ,
170+ " ====================================================================================================\n " ,
171+ " Q: Can we auto-answer RFP security questionnaires? What should we cite?\n " ,
172+ " \n " ,
173+ " A: Cite sources inline.\n " ,
174+ " \n " ,
175+ " Retrieved Context (top 3):\n " ,
176+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.663\n " ,
177+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.243\n " ,
178+ " - Incident Response Runbook [runbook_inc_003:0] score=0.194\n " ,
179+ " Eval: {'terms': 7, 'hits': 1, 'hit_rate': 0.14}\n " ,
180+ " \n " ,
181+ " ====================================================================================================\n " ,
182+ " Q: If there is a suspected breach, what are the first three steps?\n " ,
183+ " \n " ,
184+ " A: Incident Response Runbook (doc:runbook_inc_003:0) If a suspected breach occurs, page on-call SecOps. Rotate keys, isolate affected services, perform forensic capture, notify DPO within regulatory SLA. Communicate via the incident room only. [2] Responsible AI Guidelines (doc:policy_ai_002:0) Use internal models for confidential data. Retrieval sources must be logged. No customer decisioning without human-in-the-loop. Redact PII in prompts and outputs. All model prompts and outputs are stored for audit for 180 days. [3] Sales SOP - Enterprise Deals (doc:sop_sales_004:0) For RFPs, use the approved security questionnaire responses. Claims must match policy_sec_001. Custom clauses need Legal sign-off. Keep records in CRM with deal room links.\n " ,
185+ " \n " ,
186+ " Retrieved Context (top 3):\n " ,
187+ " - Incident Response Runbook [runbook_inc_003:0] score=0.549\n " ,
188+ " - Responsible AI Guidelines [policy_ai_002:0] score=0.234\n " ,
189+ " - Sales SOP - Enterprise Deals [sop_sales_004:0] score=0.226\n " ,
190+ " Eval: {'terms': 7, 'hits': 2, 'hit_rate': 0.29}\n " ,
191+ " \n " ,
192+ " ====================================================================================================\n " ,
193+ " Q: Is it allowed to share all raw customer data externally for testing?\n " ,
194+ " \n " ,
195+ " A: ❌ Request violates security policy (data exfiltration/encryption tampering).\n "
196+ ]
197+ }
198+ ],
199+ "source" : [
"def eval_query(user_q: str, ctx: List[Dict]) -> Dict:\n",
"    \"\"\"Crude lexical-overlap metric: share of question terms (4+ letters) that appear in the retrieved text.\"\"\"\n",
"    query_terms = [tok.lower() for tok in re.findall(r\"[a-zA-Z]{4,}\", user_q)]\n",
"    haystack = \" \".join(block[\"text\"].lower() for block in ctx)\n",
"    matched = sum(1 for term in query_terms if term in haystack)\n",
"    return {\"terms\": len(query_terms), \"hits\": matched, \"hit_rate\": round(matched / max(1, len(query_terms)), 2)}\n",
"\n",
"QUERIES = [\n",
"    \"What encryption and backup rules do we follow for customer data?\",\n",
"    \"Can we auto-answer RFP security questionnaires? What should we cite?\",\n",
"    \"If there is a suspected breach, what are the first three steps?\",\n",
"    \"Is it allowed to share all raw customer data externally for testing?\"\n",
"]\n",
"\n",
"for q in QUERIES:\n",
"    res = answer(q, k=3)\n",
"    print(\"\\n\" + \"=\" * 100)\n",
"    print(\"Q:\", q)\n",
"    print(\"\\nA:\", res[\"answer\"])\n",
"    if res[\"ctx\"]:\n",
"        ev = eval_query(q, res[\"ctx\"])\n",
"        print(\"\\nRetrieved Context (top 3):\")\n",
"        for r in res[\"ctx\"]:\n",
"            print(f\"- {r['title']} [{r['doc_id']}:{r['chunk_id']}] score={r['score']:.3f}\")\n",
"        print(\"Eval:\", ev)"
219+ ]
220+ }
221+ ]
222+ }
0 commit comments