from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the text splitter: ~500-character chunks with a 100-character
# overlap so context spanning a chunk boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Split the documents into chunks.
# NOTE(review): `documents` is not defined in this section — presumably it is
# produced earlier (e.g. with TextLoader); confirm against the full file.
texts = text_splitter.split_documents(documents)
# Set up the embedding model: bge-m3 served through an OpenAI-compatible
# endpoint on localhost:9997. The local server ignores the API key, but the
# client refuses an empty one, hence the placeholder value.
embeddings_model = OpenAIEmbeddings(
    model="bge-m3",
    base_url='http://localhost:9997/v1',
    api_key='cannot be empty',
    # dimensions=1024,
)

# Build a FAISS index from the chunked documents and expose it as a
# retriever returning the 30 most similar chunks per query.
retriever = FAISS.from_documents(texts, embeddings_model).as_retriever(
    search_kwargs={"k": 30}
)
# Define the query.
query = "Can you tell me about Word2Vec?"

# Execute the query and retrieve the matching chunks.
docs = retriever.invoke(query)

# Display the retrieved documents, separated by a divider line.
# BUG FIX: the original read `for i, d inenumerate(docs):` (missing space
# before `enumerate`), which is a SyntaxError.
for i, d in enumerate(docs):
    print(f"document {i+1}:\n\n" + d.page_content)
    print('-' * 100)
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Initialize the cross-encoder reranking model from a local checkpoint,
# pinned to GPU 6.
# NOTE(review): ContextualCompressionRetriever is imported but not used in
# this section — presumably it wraps `compressor` + the base retriever
# further down; confirm against the full file.
model = HuggingFaceCrossEncoder(
    model_name="../DataCollection/officials/bge-reranker-v2-m3",
    model_kwargs={'device': 'cuda:6'},
)

# Rerank the retrieved chunks and keep only the top 3.
compressor = CrossEncoderReranker(model=model, top_n=3)