RAG From Scratch
Chunk, embed, store, retrieve, generate. Build retrieval-augmented generation in a single file.
Lesson map
What this lesson covers

Learning path: the main moves in order
1. The Five Steps
2. Chunking
3. Embedding
4. Retrieval
Concept cluster: terms to connect while reading are chunking, embedding, retrieval, and grounding.
Section 1
The Five Steps
RAG is: chunk documents, embed chunks, store vectors, retrieve top-k for a query, generate an answer grounded in retrieved chunks. Everything else is variation.
Chunking with overlap, batched embeddings, cosine similarity. The full math fits in 20 lines.
from openai import OpenAI
import numpy as np

client = OpenAI()

def chunk(text: str, size: int = 400, overlap: int = 50) -> list[str]:
    # Split on whitespace and emit overlapping windows of `size` words.
    words = text.split()
    chunks: list[str] = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i:i+size]))
        i += size - overlap
    return chunks

def embed(texts: list[str]) -> np.ndarray:
    # One API call embeds the whole batch of texts.
    r = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return np.array([d.embedding for d in r.data], dtype=np.float32)

def cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # Cosine similarity of each row of `a` against a single query vector `b`.
    a_n = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_n = b / np.linalg.norm(b)
    return a_n @ b_n

Embed once at startup, search in memory, and ground the prompt in the retrieved chunks. Good enough for 10k-chunk corpora.
DOC = open("handbook.txt", encoding="utf-8").read()
CHUNKS = chunk(DOC)
MATRIX = embed(CHUNKS)  # embed every chunk once at startup

def answer(question: str, k: int = 4) -> str:
    # Embed the question, rank all chunks by cosine similarity, keep the top k.
    q_vec = embed([question])[0]
    scores = cosine(MATRIX, q_vec)
    top = np.argsort(-scores)[:k]
    context = "\n\n---\n\n".join(CHUNKS[i] for i in top)
    # Generate an answer grounded only in the retrieved context.
    r = client.responses.create(
        model="gpt-5",
        input=[
            {"role": "system", "content": "Answer only from the provided context. If unsure, say you don't know."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return r.output_text

print(answer("What is the PTO policy?"))

Understanding "RAG From Scratch" in practice: AI-assisted coding shifts work from syntax recall to design thinking, so a model can write boilerplate like this pipeline while you focus on the architecture. Chunk, embed, store, retrieve, generate: owning each step in a single file gives you a concrete advantage when any of them needs debugging or tuning.
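One practical wrinkle with the embedding step: embed(CHUNKS) sends every chunk in a single request, and past a certain corpus size the embeddings API's per-request input limit forces you to batch. A minimal sketch of that adjustment, reusing the embed() defined above and assuming a hypothetical batch size of 256 (the real limit depends on the API):

def embed_in_batches(texts: list[str], batch_size: int = 256) -> np.ndarray:
    # Hypothetical batch size; pick one under the embeddings API's per-request limit.
    parts = [embed(texts[i:i + batch_size]) for i in range(0, len(texts), batch_size)]
    return np.vstack(parts)

# Drop-in replacement for the startup indexing step:
# MATRIX = embed_in_batches(CHUNKS)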
- Apply chunking with overlap so each retrieved passage carries enough surrounding context
- Apply embedding in batches so the whole corpus is indexed once at startup
- Apply retrieval by inspecting which top-k chunks actually come back for real queries (see the sketch after this list)
- Apply grounding by constraining the model to answer only from the retrieved context
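A quick way to check retrieval and grounding before trusting any answer is to look at what actually gets retrieved. A minimal sketch using the functions defined above; show_retrieval is a hypothetical debugging helper, not part of the lesson's pipeline:

def show_retrieval(question: str, k: int = 4) -> None:
    # Print the top-k chunks and their cosine scores so you can judge
    # whether the context the model will see actually answers the question.
    q_vec = embed([question])[0]
    scores = cosine(MATRIX, q_vec)
    for rank, i in enumerate(np.argsort(-scores)[:k], start=1):
        print(f"#{rank}  score={scores[i]:.3f}  {CHUNKS[i][:120]}...")

show_retrieval("What is the PTO policy?")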
1. Use AI to generate unit tests for an existing function
2. Ask AI to refactor a messy function and explain the changes
3. Have AI suggest a code review for a recent pull request
The big idea: RAG is a five-step pipeline, not magic. Own every step once, then upgrade with a real vector DB when you outgrow numpy.
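When you do outgrow numpy, only the retrieval step swaps out; chunking, embedding, and generation stay the same. A minimal sketch of that swap, assuming a Postgres database with the pgvector extension and a chunks(content, embedding) table; the table and column names are illustrative, not prescribed by this lesson:

import psycopg
from pgvector.psycopg import register_vector

conn = psycopg.connect("dbname=rag")  # illustrative connection string
register_vector(conn)                  # teach psycopg the vector type

def retrieve_pg(question: str, k: int = 4) -> list[str]:
    # Same embedding model as above; only the similarity search moves into Postgres.
    q_vec = embed([question])[0]
    rows = conn.execute(
        "SELECT content FROM chunks ORDER BY embedding <=> %s LIMIT %s",  # <=> is cosine distance
        (q_vec, k),
    ).fetchall()
    return [r[0] for r in rows]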
Related lessons
Keep going
Creators · 50 min
Vector DB Basics With pgvector
Store embeddings, search by similarity. The foundation of every RAG system. Postgres plus pgvector gets you there.
Creators · 40 min
Agents vs. Autocomplete — the Mental Model Shift
Autocomplete is a suggestion. An agent is an actor. The mental model you bring to each is different, and conflating them is the number-one reason teams trip over AI coding.
Creators · 50 min
Long-Context Code Understanding — The 1M-Token Era
Frontier models now read a million tokens of your codebase in one shot. That changes how we architect prompts, retrieval, and the cost curve of agentic work.
