1. 环境准备
1.1 依赖安装
# 安装LlamaIndex核心包
pip install llama-index

# 安装多语言相关依赖
pip install llama-index-embeddings-huggingface
pip install sentence-transformers

# 安装语言检测工具
pip install langdetect
# 或
pip install lingua

# 安装向量数据库(可选,根据需要选择)
pip install chromadb        # 轻量级本地向量数据库
# 或
pip install qdrant-client   # 生产级向量数据库

# 安装其他工具
pip install python-dotenv   # 环境变量管理
pip install pydantic        # 数据验证

1.2 环境配置
创建.env文件:
# LLM API Keys(根据需要选择)
OPENAI_API_KEY=your-openai-api-key
# 或
ANTHROPIC_API_KEY=your-anthropic-api-key
# 或
LLAMA_CLOUD_API_KEY=your-llama-cloud-api-key

# 向量数据库配置(如果使用远程数据库)
QDRANT_HOST=localhost
QDRANT_PORT=6333

1.3 项目结构
multilingual-rag/
├── src/
│   ├── __init__.py
│   ├── language_detector.py      # 语言检测模块
│   ├── multilingual_embedder.py  # 多语言Embedding模块
│   ├── query_router.py           # 查询路由模块
│   ├── multilingual_rag.py       # 主RAG类
│   └── utils.py                  # 工具函数
├── data/                         # 文档目录
│   ├── chinese/
│   ├── english/
│   └── mixed/
├── examples/                     # 示例代码
│   ├── basic_example.py
│   ├── advanced_example.py
│   └── batch_processing.py
├── tests/                        # 测试文件
├── .env                          # 环境变量
├── requirements.txt              # 依赖列表
└── README.md

2. LlamaIndex多语言RAG基础实现
2.1 基础多语言RAG实现
使用LlamaIndex实现基础的多语言RAG系统:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables (.env)
load_dotenv()

# Configure a multilingual embedding model
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Configure the LLM
llm = OpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

# Apply global settings so all indices/engines pick them up
Settings.embed_model = embed_model
Settings.llm = llm

# Load multilingual documents
documents = SimpleDirectoryReader("data").load_data()

# Build the vector index
index = VectorStoreIndex.from_documents(documents)

# Create the query engine
query_engine = index.as_query_engine()

# Multilingual query examples
queries = [
    "What is LlamaIndex?",   # English
    "LlamaIndex是什么?",     # Chinese
    "¿Qué es LlamaIndex?",   # Spanish
]

for query in queries:
    response = query_engine.query(query)
    print(f"Query: {query}")
    print(f"Response: {response}\n")

# 2.2 使用LlamaIndex最新API
根据LlamaIndex最新版本(v0.10+),使用正确的导入方式:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Initialize the multilingual embedding model
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device="cpu"  # or "cuda" if a GPU is available
)

# Configure the LLM
llm = OpenAI(model="gpt-4o-mini", temperature=0.1)

Settings.embed_model = embed_model
Settings.llm = llm

# Use Chroma as the vector store
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection("multilingual_rag")
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Load documents
documents = SimpleDirectoryReader("data").load_data()

# Build the index on top of the Chroma store
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Persist the index to disk
index.storage_context.persist(persist_dir="./storage")

# Query
query_engine = index.as_query_engine()
response = query_engine.query("Explain RAG in Chinese")
print(response)

# 3. 多语言Embedding模型集成
3.1 支持的多语言Embedding模型
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Registry of supported multilingual embedding models
EMBEDDING_MODELS = {
    "multilingual-e5-large": {
        "model_name": "intfloat/multilingual-e5-large",
        "description": "性能最优,支持100+语言",
        "recommended": True,
    },
    "multilingual-e5-base": {
        "model_name": "intfloat/multilingual-e5-base",
        "description": "性能与速度平衡",
        "recommended": True,
    },
    "paraphrase-multilingual": {
        "model_name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "description": "轻量级,速度快",
        "recommended": False,
    },
    "bge-m3": {
        "model_name": "BAAI/bge-m3",
        "description": "中文优化,支持多粒度",
        "recommended": True,
    },
}


def create_embed_model(model_key: str = "multilingual-e5-base", device: str = "cpu"):
    """Create a multilingual embedding model from the registry.

    Args:
        model_key: key into EMBEDDING_MODELS.
        device: "cpu" or "cuda".

    Returns:
        A configured HuggingFaceEmbedding instance.

    Raises:
        ValueError: if model_key is not in the registry.
    """
    if model_key not in EMBEDDING_MODELS:
        raise ValueError(f"Unknown model: {model_key}")
    config = EMBEDDING_MODELS[model_key]
    return HuggingFaceEmbedding(model_name=config["model_name"], device=device)


# Example usage
embed_model = create_embed_model("multilingual-e5-base", device="cuda")

# 3.2 自定义Embedding模型
from llama_index.core.embeddings import BaseEmbedding
from typing import List
import torch
from sentence_transformers import SentenceTransformer


class CustomMultilingualEmbedding(BaseEmbedding):
    """Custom multilingual embedding backed by sentence-transformers.

    NOTE(review): BaseEmbedding is a pydantic model in recent LlamaIndex
    versions; assigning `self.model` in __init__ may require a PrivateAttr,
    and the abstract async/singular variants (_aget_query_embedding,
    _get_text_embedding) may also need implementing — confirm against the
    installed llama-index version.
    """

    def __init__(self, model_name: str, device: str = "cpu"):
        super().__init__()
        self.model = SentenceTransformer(model_name, device=device)
        self._model_name = model_name

    def _get_query_embedding(self, query: str) -> List[float]:
        """Embed a single query string."""
        embedding = self.model.encode(query, convert_to_numpy=True)
        return embedding.tolist()

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts."""
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return embeddings.tolist()

    @property
    def dimension(self) -> int:
        """Embedding dimensionality reported by the underlying model."""
        return self.model.get_sentence_embedding_dimension()


# Use the custom embedding
custom_embed = CustomMultilingualEmbedding(
    model_name="intfloat/multilingual-e5-large",
    device="cuda"
)

# 3.3 Embedding模型性能对比
import time
from typing import List


def benchmark_embedding_model(embed_model: "BaseEmbedding", texts: List[str]):
    """Benchmark single-query and batch embedding latency of a model.

    Args:
        embed_model: any LlamaIndex-compatible embedding model.
        texts: sample texts; texts[0] is used for the query-latency probe.
    """
    # Single-query embedding latency
    start = time.time()
    query_emb = embed_model.get_query_embedding(texts[0])
    query_time = time.time() - start

    # Batch embedding latency
    start = time.time()
    text_embs = embed_model.get_text_embeddings(texts)
    batch_time = time.time() - start

    print(f"Model: {embed_model._model_name}")
    print(f"Query embedding time: {query_time:.4f}s")
    print(f"Batch embedding time ({len(texts)} texts): {batch_time:.4f}s")
    print(f"Average time per text: {batch_time / len(texts):.4f}s")
    print(f"Embedding dimension: {embed_model.dimension}")
    print("-" * 50)


# Benchmark a few models
test_texts = [
    "Hello, how are you?",
    "你好,最近怎么样?",
    "Bonjour, comment allez-vous?",
    "Hola, ¿cómo estás?",
]

models_to_test = [
    create_embed_model("multilingual-e5-base"),
    create_embed_model("paraphrase-multilingual"),
]

for model in models_to_test:
    benchmark_embedding_model(model, test_texts)

# 4. 语言检测与路由实现
4.1 语言检测实现
from langdetect import detect, detect_langs, LangDetectException
from lingua import Language, LanguageDetectorBuilder
from typing import Optional, Dict, List, Any


class LanguageDetector:
    """Language detector with pluggable backends (langdetect or lingua)."""

    def __init__(self, method: str = "langdetect"):
        """
        Args:
            method: detection backend, either "langdetect" or "lingua"
        """
        self.method = method
        if method == "lingua":
            # lingua requires a pre-built detector instance
            self.detector = LanguageDetectorBuilder.from_all_languages().build()
        else:
            self.detector = None

    def detect(self, text: str) -> Dict[str, Any]:
        """
        Detect the language of *text*.

        Returns:
            {
                "language": "zh" or "en" etc,
                "confidence": 0.0-1.0,
                "is_reliable": True/False
            }
        """
        if self.method == "lingua":
            return self._detect_lingua(text)
        else:
            return self._detect_langdetect(text)

    def _detect_langdetect(self, text: str) -> Dict[str, Any]:
        """Detect using langdetect; returns an 'unknown' record on failure."""
        try:
            # Primary language
            main_lang = detect(text)
            # All candidate languages with probabilities
            lang_confidence = detect_langs(text)
            confidence = lang_confidence[0].prob if lang_confidence else 0.5
            return {
                "language": main_lang,
                "confidence": confidence,
                "is_reliable": confidence > 0.7,
                "all_languages": [
                    {"lang": l.lang, "prob": l.prob} for l in lang_confidence[:3]
                ],
            }
        except LangDetectException:
            return {
                "language": "unknown",
                "confidence": 0.0,
                "is_reliable": False,
                "all_languages": [],
            }

    def _detect_lingua(self, text: str) -> Dict[str, Any]:
        """Detect using lingua."""
        detected_language = self.detector.detect_language_of(text)
        if detected_language is None:
            return {
                "language": "unknown",
                "confidence": 0.0,
                "is_reliable": False,
                "all_languages": [],
            }
        # lingua does not attach confidence to detect_language_of; query it separately.
        # NOTE(review): recent lingua-py versions return a list of ConfidenceValue
        # objects here rather than a dict — confirm against the installed version.
        confidence_values = self.detector.compute_language_confidence_values(text)
        return {
            "language": detected_language.iso_code_639_1.name.lower(),
            "confidence": confidence_values.get(detected_language, 0.0),
            "is_reliable": confidence_values.get(detected_language, 0.0) > 0.7,
            "all_languages": [
                {"lang": lang.iso_code_639_1.name.lower(), "prob": prob}
                for lang, prob in list(confidence_values.items())[:3]
            ],
        }

    def is_multilingual(self, text: str, threshold: float = 0.3) -> bool:
        """Return True when the second-ranked language exceeds *threshold*."""
        lang_info = self.detect(text)
        if len(lang_info["all_languages"]) < 2:
            return False
        second_lang_prob = lang_info["all_languages"][1]["prob"]
        return second_lang_prob > threshold


# Example usage
detector = LanguageDetector(method="langdetect")

test_texts = [
    "Hello, how are you?",
    "你好,最近怎么样?",
    "This is a test. 这是一个测试。",
]

for text in test_texts:
    result = detector.detect(text)
    print(f"Text: {text}")
    print(f"Language: {result['language']}, Confidence: {result['confidence']:.2f}")
    print(f"Is Multilingual: {detector.is_multilingual(text)}\n")

# 4.2 查询路由实现
from typing import Literal, Optional, Dict
from llama_index.core import VectorStoreIndex, QueryBundle
from llama_index.core.query_engine import BaseQueryEngine


class QueryRouter:
    """Routes a query to a language-specific index or the multilingual index."""

    def __init__(
        self,
        multilingual_index: VectorStoreIndex,
        language_specific_indices: Optional[Dict[str, VectorStoreIndex]] = None,
        default_strategy: Literal["multilingual", "language_specific"] = "multilingual",
    ):
        """
        Args:
            multilingual_index: unified multilingual index
            language_specific_indices: per-language indices, e.g. {"zh": index_zh, "en": index_en}
            default_strategy: default routing strategy
        """
        self.multilingual_index = multilingual_index
        self.language_specific_indices = language_specific_indices or {}
        self.default_strategy = default_strategy
        self.detector = LanguageDetector()

    def route_query(self, query: str) -> BaseQueryEngine:
        """
        Pick a query engine based on the detected query language.

        Returns:
            a QueryEngine instance
        """
        lang_info = self.detector.detect(query)
        detected_lang = lang_info["language"]
        confidence = lang_info["confidence"]

        # Low confidence or unknown language: fall back to the multilingual index
        if confidence < 0.7 or detected_lang == "unknown":
            return self.multilingual_index.as_query_engine()

        # Prefer a language-specific index when one exists
        if detected_lang in self.language_specific_indices:
            return self.language_specific_indices[detected_lang].as_query_engine()

        # Otherwise use the multilingual index
        return self.multilingual_index.as_query_engine()

    def query(self, query: str) -> str:
        """Route then execute the query, returning the response text."""
        query_engine = self.route_query(query)
        response = query_engine.query(query)
        return str(response)


# Example usage
router = QueryRouter(
    multilingual_index=index,
    language_specific_indices={
        "zh": chinese_index,  # assumed already built
        "en": english_index,  # assumed already built
    },
)

response = router.query("What is RAG?")
print(response)

# 5. 完整实现示例
5.1 完整的多语言RAG类
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    StorageContext,
    Document,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from typing import List, Optional, Dict
import os
from dotenv import load_dotenv

load_dotenv()


class MultilingualRAG:
    """End-to-end multilingual RAG system."""

    def __init__(
        self,
        embed_model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        llm_model: str = "gpt-4o-mini",
        persist_dir: Optional[str] = "./storage",
        use_chroma: bool = True,
    ):
        """
        Initialize the multilingual RAG system.

        Args:
            embed_model_name: embedding model name
            llm_model: LLM model name
            persist_dir: persistence directory (None disables persistence)
            use_chroma: whether to use the Chroma vector store
        """
        # Embedding model
        self.embed_model = HuggingFaceEmbedding(
            model_name=embed_model_name,
            device="cpu"
        )
        # LLM
        self.llm = OpenAI(model=llm_model, api_key=os.getenv("OPENAI_API_KEY"))
        # Global settings
        Settings.embed_model = self.embed_model
        Settings.llm = self.llm

        # Vector store
        self.persist_dir = persist_dir
        if use_chroma:
            chroma_client = chromadb.Client()
            collection = chroma_client.get_or_create_collection("multilingual_rag")
            self.vector_store = ChromaVectorStore(chroma_collection=collection)
            self.storage_context = StorageContext.from_defaults(
                vector_store=self.vector_store
            )
        else:
            self.storage_context = None

        # Language detector
        self.detector = LanguageDetector()
        # Index is built lazily via build_index()
        self.index = None

    def load_documents(
        self,
        data_dir: str,
        recursive: bool = True,
        required_exts: Optional[List[str]] = None,
    ):
        """Load documents from *data_dir*."""
        reader = SimpleDirectoryReader(
            input_dir=data_dir,
            recursive=recursive,
            required_exts=required_exts,
        )
        documents = reader.load_data()
        return documents

    def build_index(self, documents: List[Document], rebuild: bool = False):
        """Build the index, loading a persisted one first unless *rebuild*."""
        # Try loading an existing persisted index
        if not rebuild and self.persist_dir and os.path.exists(self.persist_dir):
            try:
                from llama_index.core import load_index_from_storage
                self.index = load_index_from_storage(
                    StorageContext.from_defaults(persist_dir=self.persist_dir)
                )
                print(f"Loaded index from {self.persist_dir}")
                return
            except Exception as e:
                print(f"Failed to load index: {e}, rebuilding...")

        # Build a new index
        if self.storage_context:
            self.index = VectorStoreIndex.from_documents(
                documents, storage_context=self.storage_context
            )
        else:
            self.index = VectorStoreIndex.from_documents(documents)

        # Persist the index
        if self.persist_dir:
            self.index.storage_context.persist(persist_dir=self.persist_dir)
            print(f"Index persisted to {self.persist_dir}")

    def query(
        self,
        query_text: str,
        similarity_top_k: int = 5,
        response_mode: str = "compact",
    ) -> str:
        """
        Execute a query against the built index.

        Args:
            query_text: the query text
            similarity_top_k: number of documents to retrieve
            response_mode: "compact", "tree_summarize" or "refine"

        Returns:
            The response text.

        Raises:
            ValueError: if build_index() has not been called yet.
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first.")

        # Detect (and log) the query language
        lang_info = self.detector.detect(query_text)
        print(f"Detected language: {lang_info['language']} "
              f"(confidence: {lang_info['confidence']:.2f})")

        query_engine = self.index.as_query_engine(
            similarity_top_k=similarity_top_k,
            response_mode=response_mode,
        )
        response = query_engine.query(query_text)
        return str(response)

    def add_documents(self, documents: List[Document]):
        """Incrementally insert documents into the existing index."""
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first.")
        for doc in documents:
            self.index.insert(doc)
        # Re-persist after the insert
        if self.persist_dir:
            self.index.storage_context.persist(persist_dir=self.persist_dir)


# Example usage
def main():
    rag = MultilingualRAG(
        embed_model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        llm_model="gpt-4o-mini",
        persist_dir="./storage",
    )
    documents = rag.load_documents("data")
    rag.build_index(documents, rebuild=False)

    queries = [
        "What is RAG?",
        "什么是RAG?",
        "¿Qué es RAG?",
    ]
    for query in queries:
        print(f"\n{'=' * 50}")
        print(f"Query: {query}")
        print(f"{'=' * 50}")
        response = rag.query(query)
        print(f"Response: {response}\n")


if __name__ == "__main__":
    main()

# 5.2 批量处理实现
from typing import List
import asyncio
from llama_index.core.async_utils import asyncio_run


class BatchMultilingualRAG(MultilingualRAG):
    """Multilingual RAG with concurrent batch querying."""

    async def async_query(self, query_text: str) -> str:
        """Run a single query asynchronously.

        Raises:
            ValueError: if the index has not been built.
        """
        if self.index is None:
            raise ValueError("Index not built.")
        query_engine = self.index.as_query_engine()
        response = await query_engine.aquery(query_text)
        return str(response)

    async def batch_query(self, queries: List[str]) -> List[str]:
        """Run all queries concurrently and return responses in order."""
        tasks = [self.async_query(query) for query in queries]
        responses = await asyncio.gather(*tasks)
        return responses

    def query_batch(self, queries: List[str]) -> List[str]:
        """Synchronous batch query (delegates to the async path internally)."""
        return asyncio_run(self.batch_query(queries))


# Example usage
async def batch_example():
    rag = BatchMultilingualRAG()
    documents = rag.load_documents("data")
    rag.build_index(documents)

    queries = [
        "What is machine learning?",
        "什么是机器学习?",
        "Qu'est-ce que l'apprentissage automatique?",
    ]
    responses = await rag.batch_query(queries)
    for query, response in zip(queries, responses):
        print(f"Q: {query}")
        print(f"A: {response}\n")


# Run
asyncio.run(batch_example())

# 6. 高级功能实现
6.1 混合检索实现(向量+关键词)
from llama_index.core import VectorStoreIndex, KeywordTableIndex
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.tools import QueryEngineTool


class HybridMultilingualRAG:
    """Hybrid retrieval (vector + keyword) multilingual RAG.

    An LLM selector routes each query to whichever engine fits it better.
    """

    def __init__(self, documents, embed_model, llm):
        self.embed_model = embed_model
        self.llm = llm

        # Vector index for semantic similarity
        self.vector_index = VectorStoreIndex.from_documents(documents)
        # Keyword index for exact term matching
        self.keyword_index = KeywordTableIndex.from_documents(documents)

        # Wrap both engines as routable tools
        vector_tool = QueryEngineTool.from_defaults(
            query_engine=self.vector_index.as_query_engine(),
            description="Useful for semantic similarity search"
        )
        keyword_tool = QueryEngineTool.from_defaults(
            query_engine=self.keyword_index.as_query_engine(),
            description="Useful for exact keyword matching"
        )

        # LLM-driven router picks one tool per query
        self.query_engine = RouterQueryEngine.from_defaults(
            selector=LLMSingleSelector.from_defaults(llm=llm),
            query_engine_tools=[vector_tool, keyword_tool]
        )

    def query(self, query_text: str) -> str:
        """Execute a hybrid-retrieval query and return the response text."""
        response = self.query_engine.query(query_text)
        return str(response)

# 6.2 重排序实现
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import VectorStoreIndex, QueryBundle


class RerankedMultilingualRAG:
    """Multilingual RAG with cross-encoder reranking of retrieved nodes."""

    def __init__(self, index: VectorStoreIndex, top_k: int = 10, rerank_top_n: int = 3):
        self.index = index
        self.top_k = top_k
        self.rerank_top_n = rerank_top_n

        # Cross-encoder reranker.
        # NOTE(review): ms-marco-MiniLM is an English-trained cross-encoder;
        # for multilingual corpora consider a multilingual reranker instead.
        self.reranker = SentenceTransformerRerank(
            model="cross-encoder/ms-marco-MiniLM-L-6-v2",
            top_n=rerank_top_n
        )

    def query(self, query_text: str) -> str:
        """Retrieve top_k candidates, rerank to top_n, then answer."""
        # Over-retrieve so the reranker has candidates to choose from
        query_engine = self.index.as_query_engine(
            similarity_top_k=self.top_k,
            node_postprocessors=[self.reranker]
        )
        response = query_engine.query(query_text)
        return str(response)

# 6.3 跨语言检索实现
class CrossLingualRAG(MultilingualRAG):
    """Cross-lingual RAG: query documents in language B using language A."""

    def __init__(self, *args, enable_translation: bool = False, **kwargs):
        super().__init__(*args, **kwargs)
        self.enable_translation = enable_translation

    def query_cross_lingual(
        self,
        query_text: str,
        target_languages: Optional[List[str]] = None,
    ) -> Dict[str, any]:  # NOTE(review): `any` should be typing.Any
        """
        Run a cross-lingual query.

        Args:
            query_text: the query text
            target_languages: languages to keep; None keeps all languages

        Returns:
            Dict with the query language, the generated response, and the
            retrieved snippets grouped by detected language.
        """
        # Detect the query language
        query_lang = self.detector.detect(query_text)["language"]

        # Retrieve candidate nodes
        retriever = self.index.as_retriever(similarity_top_k=10)
        nodes = retriever.retrieve(query_text)

        # Group results by detected node language
        results_by_lang = {}
        for node in nodes:
            # Language could come from metadata; here we detect from text
            node_text = node.get_content()
            node_lang = self.detector.detect(node_text)["language"]
            if target_languages and node_lang not in target_languages:
                continue
            results_by_lang.setdefault(node_lang, []).append(node)

        # Generate the answer
        query_engine = self.index.as_query_engine()
        response = query_engine.query(query_text)

        return {
            "query_language": query_lang,
            "response": str(response),
            "results_by_language": {
                lang: [n.get_content()[:200] for n in lang_nodes]
                for lang, lang_nodes in results_by_lang.items()
            },
        }

# 7. 实战案例
7.1 案例一:多语言技术文档问答系统
"""
场景:技术文档包含中英文,用户可以用任意语言查询
"""


def tech_doc_qa_example():
    """Multilingual technical-documentation Q&A demo."""
    rag = MultilingualRAG(
        embed_model_name="intfloat/multilingual-e5-base",
        llm_model="gpt-4o-mini"
    )

    # Load the technical documents and build the index
    documents = rag.load_documents("data/tech_docs")
    rag.build_index(documents)

    # Mixed-language test queries
    test_cases = [
        {"query": "How to use async/await in Python?", "expected_lang": "en"},
        {"query": "Python中如何使用async/await?", "expected_lang": "zh"},
        {"query": "What is the difference between list and tuple?", "expected_lang": "en"},
        {"query": "列表和元组的区别是什么?", "expected_lang": "zh"},
    ]

    for case in test_cases:
        response = rag.query(case["query"])
        print(f"Query ({case['expected_lang']}): {case['query']}")
        print(f"Response: {response}\n")


tech_doc_qa_example()

# 7.2 案例二:多语言客服知识库
"""
场景:客服知识库包含多语言FAQ,需要快速准确回答用户问题
"""


def customer_service_example():
    """Multilingual customer-service FAQ demo with reranking."""
    rag = MultilingualRAG(
        embed_model_name="BAAI/bge-m3",  # Chinese-optimized model
        llm_model="gpt-4o-mini"
    )

    # Load FAQ documents and build the index
    documents = rag.load_documents("data/faq")
    rag.build_index(documents)

    # Wrap the index with a reranked query path
    reranked_rag = RerankedMultilingualRAG(
        index=rag.index,
        top_k=10,
        rerank_top_n=3
    )

    # Sample user queries
    user_queries = [
        "How to reset password?",        # English
        "如何重置密码?",                 # Chinese
        "退款流程是什么?",               # Chinese
        "What is the refund process?",   # English
    ]

    for query in user_queries:
        response = reranked_rag.query(query)
        print(f"Q: {query}")
        print(f"A: {response}\n")


customer_service_example()

# 7.3 案例三:学术论文多语言检索
"""
场景:学术论文库包含多语言论文,研究人员需要跨语言检索
"""


def academic_paper_example():
    """Cross-lingual academic-paper retrieval demo."""
    rag = CrossLingualRAG(
        embed_model_name="intfloat/multilingual-e5-large",
        llm_model="gpt-4o-mini",
        enable_translation=True
    )

    # Load papers and build the index
    documents = rag.load_documents("data/papers")
    rag.build_index(documents)

    # Chinese query, retrieving both Chinese and English papers
    query = "机器学习在自然语言处理中的应用"
    result = rag.query_cross_lingual(
        query,
        target_languages=["en", "zh"]
    )

    print(f"Query Language: {result['query_language']}")
    print(f"Response: {result['response']}")
    print("\nResults by Language:")
    for lang, texts in result['results_by_language'].items():
        print(f"\n{lang}:")
        for i, text in enumerate(texts[:3], 1):
            print(f"{i}. {text}...")


academic_paper_example()

# 8. 性能优化实现
8.1 缓存实现
from functools import lru_cache
import hashlib
import json


class CachedMultilingualRAG(MultilingualRAG):
    """Multilingual RAG with a simple bounded in-memory response cache."""

    def __init__(self, *args, cache_size: int = 100, **kwargs):
        super().__init__(*args, **kwargs)
        self.cache = {}
        self.cache_size = cache_size

    def _get_cache_key(self, query: str) -> str:
        """Hash the query text into a stable cache key."""
        return hashlib.md5(query.encode()).hexdigest()

    def query(self, query_text: str, use_cache: bool = True, **kwargs) -> str:
        """Query with caching; identical queries hit the cache."""
        if use_cache:
            cache_key = self._get_cache_key(query_text)
            if cache_key in self.cache:
                print("Cache hit!")
                return self.cache[cache_key]

        # Cache miss: run the real query
        response = super().query(query_text, **kwargs)

        # Store the response while the cache has room (no eviction policy)
        if use_cache and len(self.cache) < self.cache_size:
            cache_key = self._get_cache_key(query_text)
            self.cache[cache_key] = response

        return response

# 8.2 异步优化
import asyncio
from llama_index.core.async_utils import run_async_tasks


class AsyncMultilingualRAG(MultilingualRAG):
    """Fully asynchronous multilingual RAG."""

    async def async_build_index(self, documents):
        """Build the index asynchronously (embeddings computed concurrently)."""
        if self.storage_context:
            self.index = await VectorStoreIndex.afrom_documents(
                documents, storage_context=self.storage_context
            )
        else:
            self.index = await VectorStoreIndex.afrom_documents(documents)

    async def async_query(self, query_text: str, **kwargs) -> str:
        """Execute a query asynchronously.

        Raises:
            ValueError: if the index has not been built.
        """
        if self.index is None:
            raise ValueError("Index not built.")
        query_engine = self.index.as_query_engine(**kwargs)
        response = await query_engine.aquery(query_text)
        return str(response)

# 9. 测试与评估
9.1 单元测试
import unittest


class TestMultilingualRAG(unittest.TestCase):
    """Unit tests for the multilingual RAG system."""

    def setUp(self):
        """Build a tiny bilingual index before each test."""
        self.rag = MultilingualRAG(
            embed_model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        )
        from llama_index.core import Document
        self.test_docs = [
            Document(text="Python is a programming language.", metadata={"lang": "en"}),
            Document(text="Python是一种编程语言。", metadata={"lang": "zh"}),
        ]
        self.rag.build_index(self.test_docs)

    def test_language_detection(self):
        """Language detection returns 'en' with high confidence for English."""
        detector = LanguageDetector()
        result = detector.detect("Hello, world!")
        self.assertEqual(result["language"], "en")
        self.assertGreater(result["confidence"], 0.7)

    def test_multilingual_query(self):
        """Queries in both languages return non-empty strings."""
        response_en = self.rag.query("What is Python?")
        self.assertIsInstance(response_en, str)
        self.assertGreater(len(response_en), 0)

        response_zh = self.rag.query("Python是什么?")
        self.assertIsInstance(response_zh, str)
        self.assertGreater(len(response_zh), 0)

    def test_cross_lingual_retrieval(self):
        """Cross-lingual result structure (only applies to CrossLingualRAG)."""
        if isinstance(self.rag, CrossLingualRAG):
            result = self.rag.query_cross_lingual("What is Python?")
            self.assertIn("query_language", result)
            self.assertIn("response", result)


if __name__ == "__main__":
    unittest.main()

# 9.2 性能评估
import time
from typing import List, Dict


def evaluate_rag_performance(
    rag: "MultilingualRAG",
    test_queries: List[Dict[str, str]],
) -> Dict[str, float]:
    """Evaluate RAG latency and success rate over a list of test queries.

    Args:
        rag: any object exposing query(text) -> str.
        test_queries: dicts each containing a "query" key.

    Returns:
        Dict with total_queries, total_time (wall clock), avg_time
        (mean per-query time), successful_queries and success_rate.
    """
    results = {
        "total_queries": len(test_queries),
        "total_time": 0,
        "avg_time": 0,
        "successful_queries": 0,
    }

    start_time = time.time()
    for query_info in test_queries:
        try:
            query_start = time.time()
            response = rag.query(query_info["query"])
            query_time = time.time() - query_start
            results["total_time"] += query_time
            results["successful_queries"] += 1
            print(f"Query: {query_info['query']}")
            print(f"Time: {query_time:.2f}s")
            print(f"Response length: {len(response)} chars\n")
        except Exception as e:
            print(f"Error processing query: {e}")

    # Guard against an empty query list (original divided by zero here)
    if results["total_queries"]:
        results["avg_time"] = results["total_time"] / results["total_queries"]
        results["success_rate"] = results["successful_queries"] / results["total_queries"]
    else:
        results["success_rate"] = 0.0
    # avg_time is based on summed per-query time; total_time is then
    # overwritten with overall wall-clock time, matching the original example.
    results["total_time"] = time.time() - start_time
    return results


# Example usage (guarded so the module can be imported without side effects)
if __name__ == "__main__":
    test_queries = [
        {"query": "What is RAG?", "lang": "en"},
        {"query": "什么是RAG?", "lang": "zh"},
        {"query": "Explain machine learning", "lang": "en"},
    ]

    rag = MultilingualRAG()
    documents = rag.load_documents("data")
    rag.build_index(documents)

    performance = evaluate_rag_performance(rag, test_queries)
    print("\nPerformance Summary:")
    for key, value in performance.items():
        print(f"{key}: {value}")

# 10. 总结
本实现指南提供了使用LlamaIndex构建多语言RAG系统的完整代码示例,包括:
- 基础实现:使用多语言Embedding模型的基本RAG系统
- 语言检测:集成语言检测功能
- 路由机制:根据语言选择不同的处理策略
- 高级功能:混合检索、重排序、跨语言检索
- 实战案例:技术文档、客服系统、学术检索等场景
- 性能优化:缓存、异步处理等优化技术
- 测试评估:单元测试和性能评估方法
关键要点:
- 选择合适的多语言Embedding模型
- 实现可靠的语言检测机制
- 根据场景选择合适的路由策略
- 持续优化和评估系统性能