RAG 系统设计:从 naive 到 production-ready
RAG 的入门门槛很低,但要做好很难。大多数教程止步于"向量数据库 + LLM",而生产级 RAG 的核心是分块策略、召回质量和重排序。
为什么需要 RAG
LLM 存在三个核心局限:
- 知识截止:训练数据有时间限制,无法获取最新信息
- 私有数据:企业内部文档、代码库、知识库 LLM 没见过
- 幻觉问题:无相关知识时模型会"编"答案
RAG(Retrieval-Augmented Generation)通过在推理时动态检索相关文档,将这些内容注入到上下文中,让模型基于事实回答。
RAG 全链路
文档 -> [处理] -> [分块] -> [Embedding] -> [向量数据库]
|
用户问题 -> [查询改写] -> [检索] -> [重排序] -> [LLM生成] -> 回答
阶段一:文档处理
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re
def clean_document(text: str) -> str:
    """Normalize raw extracted document text before chunking.

    Page-number markers of the form ``- 12 -`` are removed, runs of three or
    more newlines are collapsed to a single blank line, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text.
    """
    # Remove page markers first: deleting a marker that sat on its own line
    # leaves extra newlines behind, which the collapse below then folds away.
    text = re.sub(r'- \d+ -', '', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
阶段二:分块策略
分块是 RAG 质量的关键决策,没有银弹。
固定大小分块(Fixed-size):
# Configure the recursive character splitter used for fixed-size chunking.
splitter = RecursiveCharacterTextSplitter(
# NOTE(review): chunk_size is measured with the splitter's length function,
# which defaults to len() (characters), not tokens. For token-based sizing
# build the splitter with from_tiktoken_encoder — confirm against LangChain docs.
chunk_size=512,
chunk_overlap=64, # span shared by consecutive chunks
separators=["\n\n", "\n", "。", ".", " ", ""]
)
chunks = splitter.split_documents(docs)
语义分块(Semantic Chunking):
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
# Semantic chunker: uses OpenAI embeddings to decide where chunks end.
# NOTE(review): with the "percentile" threshold type, 95 presumably means a
# split is made where the distance between adjacent sentences exceeds the
# 95th percentile — confirm against the LangChain SemanticChunker docs.
splitter = SemanticChunker(
OpenAIEmbeddings(),
breakpoint_threshold_type="percentile",
breakpoint_threshold_amount=95
)
层级分块(Hierarchical):
文档
├── 章节摘要(大块,用于召回范围定位)
│ ├── 段落(中块,主要检索单元)
│ │ └── 句子(小块,精确引用)
chunk_overlap 的作用:
- 防止关键信息被切割在两个 chunk 的边界
- 通常设为 chunk_size 的 10-20%
- 代价是存储增加和潜在重复检索
选择建议:
| 场景 | 推荐方案 |
|---|---|
| 通用文档 | RecursiveCharacterTextSplitter,chunk_size 512,overlap 64(默认按字符计长;需按 token 计时用 from_tiktoken_encoder 构造) |
| 代码 | 按函数/类分割,保留完整语义单元 |
| 结构化文档(FAQ) | 按问答对分割 |
| 多语言 | 字符级分割而非 token 级 |
阶段三:Embedding 模型选择
# OpenAI text-embedding-3-small (good price/performance)
from openai import OpenAI

# Named `openai_client` so it matches the name the answer-generation code
# uses and is not clobbered by a later vector-database `client` binding.
openai_client = OpenAI()
response = openai_client.embeddings.create(
    model="text-embedding-3-small",  # 1536 dimensions, $0.02 per 1M tokens
    input=["your text here"],
)
# One embedding is returned per input string; take the first (only) one.
embedding = response.data[0].embedding
# BGE (BAAI, open source, strong on Chinese text)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
# normalize_embeddings=True requests normalized output vectors.
embedding = model.encode("你的文本", normalize_embeddings=True)
| 模型 | 维度 | 特点 |
|---|---|---|
| text-embedding-3-small | 1536 | 便宜,效果好 |
| text-embedding-3-large | 3072 | 更准确,价格约为 small 的 6.5 倍($0.13/1M tokens) |
| BGE-large-zh | 1024 | 中文最强开源 |
| E5-mistral-7b | 4096 | 英文 SOTA,本地运行 |
阶段四:向量数据库
# Qdrant (recommended: feature-rich, open source, self-hostable)
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# Named `qdrant_client` so it matches the name the retrieval code uses and
# does not rebind a `client` variable that may already hold another SDK client.
qdrant_client = QdrantClient(url="http://localhost:6333")
qdrant_client.create_collection(
    collection_name="docs",
    # size must equal the embedding dimension (1536 for text-embedding-3-small).
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)
# One point per (embedding, chunk) pair; the chunk text rides along as payload.
points = [
    PointStruct(id=i, vector=emb, payload={"text": chunk, "source": "doc.pdf"})
    for i, (emb, chunk) in enumerate(zip(embeddings, chunks))
]
qdrant_client.upsert(collection_name="docs", points=points)
-- pgvector (the first choice when PostgreSQL is already in use)
CREATE EXTENSION vector;
-- One row per chunk: raw text, its embedding, and free-form metadata.
CREATE TABLE embeddings (
id SERIAL PRIMARY KEY,
content TEXT,
embedding vector(1536), -- 1536 = dimension of the embedding model in use
metadata JSONB
);
-- IVFFlat index using the cosine-distance operator class.
CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops);
| 数据库 | 适合场景 |
|---|---|
| Qdrant | 生产部署,功能丰富 |
| pgvector | 已用 PostgreSQL,简化架构 |
| Chroma | 本地开发,简单易用 |
| Pinecone | 全托管,无运维 |
| Weaviate | 需要混合检索(向量+BM25) |
阶段五:检索策略
稠密检索(Dense Retrieval):
# Embed the user query, then ask Qdrant for the 10 closest stored vectors.
# NOTE(review): recent qdrant-client releases deprecate `search` in favour of
# `query_points` — confirm against the installed client version.
query_embedding = embed(user_query)
results = qdrant_client.search(
collection_name="docs",
query_vector=query_embedding,
limit=10
)
稀疏检索(BM25):
from rank_bm25 import BM25Okapi
# Tokenize every chunk on whitespace and build the BM25 index over them.
# NOTE(review): str.split() only separates on whitespace, so unsegmented
# Chinese text collapses into very few tokens per chunk; a word segmenter
# (e.g. jieba) is presumably needed for Chinese corpora — confirm.
corpus = [chunk.split() for chunk in all_chunks]
bm25 = BM25Okapi(corpus)
# The query is tokenized the same way as the corpus.
scores = bm25.get_scores(query.split())
混合检索(Hybrid Search):
def hybrid_search(query: str, alpha: float = 0.5):
    """Fuse dense and BM25 results with weighted Reciprocal Rank Fusion.

    Each retriever contributes ``weight / (k + rank)`` per document, with
    ``k = 60`` and 1-based ranks as in the standard RRF formulation.

    Args:
        query: The user query.
        alpha: Weight of the dense retriever; the BM25 retriever gets
            ``1 - alpha``. ``alpha=1`` is pure vector search, ``alpha=0`` is
            pure BM25.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples, highest score first.
    """
    rrf_k = 60
    weighted_runs = (
        (alpha, dense_search(query, top_k=20)),
        (1 - alpha, bm25_search(query, top_k=20)),
    )
    fused = {}
    for weight, hits in weighted_runs:
        if weight == 0:
            # A zero-weight retriever must not inject score-0 documents,
            # otherwise alpha=1 / alpha=0 would not be "pure" modes.
            continue
        # RRF ranks are 1-based: the top hit contributes weight / (k + 1).
        for rank, doc in enumerate(hits, start=1):
            fused[doc.id] = fused.get(doc.id, 0) + weight / (rrf_k + rank)
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
阶段六:重排序(Reranker)
from sentence_transformers import CrossEncoder
reranker = CrossEncoder('BAAI/bge-reranker-large')
def rerank(query: str, candidates: list, top_k: int = 3):
    """Re-order retrieved candidates with the cross-encoder and keep the best.

    Args:
        query: The user query.
        candidates: Candidate documents from first-stage retrieval.
        top_k: Number of highest-scoring candidates to return.

    Returns:
        Up to ``top_k`` candidates, highest cross-encoder score first. An
        empty candidate list yields an empty list.
    """
    if not candidates:
        # Nothing to score: skip the model call entirely.
        return []
    # The cross-encoder scores each (query, document) pair jointly.
    pairs = [(query, doc) for doc in candidates]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]
为什么需要重排序:
- 向量检索:快,但精度受限于 embedding 质量
- Cross-encoder:慢(每个候选文档都要与查询拼接后做一次模型前向计算,耗时 O(n)),但精度高
- 典型做法:向量检索 top-50,cross-encoder 重排取 top-3
阶段七:LLM 生成
def generate_answer(query: str, contexts: list) -> str:
    """Answer ``query`` with the chat model, grounded in the given contexts.

    The contexts are joined with a ``---`` separator and embedded in a prompt
    that tells the model to say so when the references lack the answer.

    Args:
        query: The user question.
        contexts: Retrieved passages to ground the answer on.

    Returns:
        The message content of the first completion choice.
    """
    separator = "\n\n---\n\n"
    context_str = separator.join(contexts)
    # The prompt lines stay flush-left so no indentation leaks into the text.
    prompt = f"""基于以下参考资料回答问题。如果参考资料中没有相关信息,
请直接说明,不要编造内容。
参考资料:
{context_str}
问题:{query}
回答:"""
    user_message = {"role": "user", "content": prompt}
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        temperature=0.1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
评估指标
# RAGAS 框架(专为 RAG 评估设计)
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
from datasets import Dataset
# Column-oriented evaluation set: index i across the four lists is one sample
# (question, generated answer, retrieved contexts, reference answer).
data = {
"question": ["什么是 RAG?"],
"answer": ["RAG 是检索增强生成..."],
"contexts": [["RAG 全称 Retrieval-Augmented Generation..."]],
"ground_truth": ["RAG 是一种结合检索和生成的 AI 技术"]
}
# Evaluate the samples on the four RAGAS metrics imported above.
result = evaluate(Dataset.from_dict(data), metrics=[
faithfulness, answer_relevancy, context_recall, context_precision
])
关键指标:
- Recall@K:Ground truth 文档在 top-K 检索结果中出现的比例
- MRR(Mean Reciprocal Rank):首个相关文档排名的倒数均值
- Faithfulness:答案与检索到的上下文的一致性
常见工程坑
1. 噪声文档污染:
def quality_filter(
    doc: str,
    *,
    min_length: int = 100,
    max_newline_ratio: float = 0.3,
) -> bool:
    """Decide whether a document is clean enough to be indexed.

    A document is rejected when it is shorter than ``min_length`` characters
    or when newlines make up more than ``max_newline_ratio`` of its
    characters.

    Args:
        doc: The document text.
        min_length: Minimum accepted length in characters.
        max_newline_ratio: Maximum accepted fraction of newline characters.

    Returns:
        ``True`` if the document passes both checks, ``False`` otherwise.
    """
    # The explicit emptiness check keeps the ratio below safe from a division
    # by zero when a caller lowers min_length to 0.
    if not doc or len(doc) < min_length:
        return False
    newline_ratio = doc.count('\n') / len(doc)
    return newline_ratio <= max_newline_ratio
2. 召回与上下文窗口冲突:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
def fit_to_context(docs: list, max_tokens: int = 3000) -> list:
    """Keep the longest prefix of ``docs`` that fits within a token budget.

    Documents are taken in the given order; selection stops at the first
    document that would push the running token count past ``max_tokens``.

    Args:
        docs: Document texts, in the order they should be considered.
        max_tokens: Total token budget for the kept documents.

    Returns:
        The leading documents whose combined token count fits the budget.
    """
    kept = []
    remaining = max_tokens
    for text in docs:
        remaining -= len(enc.encode(text))
        if remaining < 0:
            # This document overflows the budget; drop it and everything after.
            return kept
        kept.append(text)
    return kept
3. Embedding 缓存:
import hashlib, redis, json
cache = redis.Redis()
def cached_embed(text: str) -> list:
    """Return the embedding for ``text``, using the Redis cache when possible.

    The cache key is derived from the MD5 hex digest of the text. On a miss
    the embedding is fetched from ``embed_api`` and stored as JSON with a
    24-hour expiry.

    Args:
        text: The text to embed.

    Returns:
        The embedding, either decoded from the cache or freshly computed.
    """
    key = f"embed:{hashlib.md5(text.encode()).hexdigest()}"
    hit = cache.get(key)
    if not hit:
        vector = embed_api(text)
        ttl_seconds = 86400  # 24 hours
        cache.setex(key, ttl_seconds, json.dumps(vector))
        return vector
    return json.loads(hit)
生产级优化
查询改写(Query Rewriting):
def rewrite_query(original_query: str) -> list:
    """Expand a query into itself plus LLM-generated alternative phrasings.

    The LLM is prompted for three differently worded search queries as a
    JSON array.

    Args:
        original_query: The user's original question.

    Returns:
        A list starting with ``original_query`` followed by the variants
        returned by ``llm.json_complete``.
    """
    # The prompt lines stay flush-left so no indentation leaks into the text.
    prompt = f"""为以下问题生成 3 个不同的搜索查询,覆盖不同表述方式:
问题:{original_query}
输出 JSON 数组,每个元素是一个查询字符串。"""
    return [original_query] + llm.json_complete(prompt)
HyDE(Hypothetical Document Embedding):
def hyde_search(query: str) -> list:
    """Retrieve documents via a hypothetical answer (HyDE).

    The LLM first writes a short passage answering the query; that passage,
    not the query itself, is embedded and used for the vector search.

    Args:
        query: The user query.

    Returns:
        The result of ``vector_search`` for the top 10 matches.
    """
    prompt = f"写一段简短的文字来回答:{query}(即使你不确定,也要给出合理的内容)"
    hypothetical_answer = llm.complete(prompt)
    return vector_search(embed(hypothetical_answer), top_k=10)
多路召回合并:
from concurrent.futures import ThreadPoolExecutor, as_completed
def multi_recall(query: str) -> list:
    """Run dense, BM25 and HyDE retrieval concurrently and fuse the results.

    Args:
        query: The user query.

    Returns:
        Whatever ``rrf_fusion`` returns for the list of
        ``(retriever_name, retriever_result)`` pairs.

    Raises:
        Any exception raised by one of the retrievers is re-raised here.
    """
    retrievers = (
        ("dense", dense_search),
        ("bm25", bm25_search),
        ("hyde", hyde_search),
    )
    with ThreadPoolExecutor() as executor:
        # Submit everything first so the three searches overlap.
        pending = [(name, executor.submit(search, query)) for name, search in retrievers]
        # Collect in submission order rather than completion order: the pool
        # is joined before returning anyway, and a fixed order keeps the
        # input to the fusion step (and thus tie-breaking) deterministic.
        results = [(name, future.result()) for name, future in pending]
    return rrf_fusion(results)
架构总结
                    +-----------------+
用户问题 ---------->|    查询改写     |
                    +--------+--------+
                             | 多个变体
              +--------------+--------------+
              v              v              v
          稠密检索         BM25           HyDE
              +--------------+--------------+
                             | RRF 融合 top-50
                             v
                    +-----------------+
                    |  Cross-encoder  |  重排序 top-3
                    |    Reranker     |
                    +--------+--------+
                             |
                             v
                    +-----------------+
                    |    LLM 生成     |  temperature=0.1
                    +--------+--------+
                             |
                          最终答案
RAG 的核心不是技术选型,而是数据质量 + 评估体系。没有好的评估,你不知道改动是否真的有效。