# Set desired model
openai_embedding = OpenAIEmbeddings(
    model="bge-m3",
    base_url='http://localhost:9997/v1',
    api_key='cannot be empty',
    # dimensions=1024,
)
query_vector = openai_embedding.embed_query("What is the Open AI's gpt embedding model?")
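As a quick sanity check (not part of the original output), you can inspect the returned vector; bge-m3 produces 1024-dimensional dense embeddings, which is what the commented-out dimensions=1024 above refers to.

# Sketch: the query embedding is a plain Python list of floats
print(type(query_vector), len(query_vector))  # expected length: 1024 for bge-m3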
query = "How does AI improve healthcare?"
# Sample documents about AI in healthcare
documents = [
    "AI helps doctors diagnose diseases faster, improving patient outcomes.",
    "AI can analyze medical images to detect conditions like cancer.",
    "Machine learning predicts patient outcomes based on health data.",
    "AI speeds up drug discovery by predicting the effectiveness of compounds.",
    "AI monitors patients remotely, enabling proactive care for chronic diseases.",
    "AI automates administrative tasks, saving time for healthcare workers.",
    "NLP extracts insights from electronic health records for better care.",
    "AI chatbots help with patient assessments and symptom checking.",
    "AI improves drug manufacturing, ensuring better quality and efficiency.",
    "AI optimizes hospital operations and reduces healthcare costs.",
]
def embed_query(self, text: str) -> List[float]:
    """Call out to OpenAI's embedding endpoint for embedding query text.

    Args:
        text: The text to embed.

    Returns:
        Embedding for the text.
    """
    return self.embed_documents([text])[0]
# Sort indices by similarity in descending order
sorted_idx = similarity.argsort()[0][::-1]
# Display top 3 and bottom 3 documents based on similarity
print("query: ", query)
print("Top 3 most similar documents:")
for i in range(0, 3):
    print(
        f"[{i+1}] similarity: {similarity[0][sorted_idx[i]]:.3f} | {documents[sorted_idx[i]]}"
    )
print("\nBottom 3 least similar documents:") for i inrange(1, 4): print( f"[{i}] similarity: {similarity[0][sorted_idx[-i]]:.3f} | {documents[sorted_idx[-i]]}" )
query: How does AI improve healthcare?
Top 3 most similar documents:
[1] similarity: 0.641 | AI monitors patients remotely, enabling proactive care for chronic diseases.
[2] similarity: 0.596 | AI chatbots help with patient assessments and symptom checking.
[3] similarity: 0.592 | AI automates administrative tasks, saving time for healthcare workers.
Bottom 3 least similar documents:
[1] similarity: 0.432 | Machine learning predicts patient outcomes based on health data.
[2] similarity: 0.485 | AI optimizes hospital operations and reduces healthcare costs.
[3] similarity: 0.503 | NLP extracts insights from electronic health records for better care.
def search_similar_documents(q, docs, hf_embeddings):
    """
    Search for the most relevant documents based on a query using text embeddings.

    Args:
        q (str): The query string for which relevant documents are to be found.
        docs (list of str): A list of document strings to compare against the query.
        hf_embeddings: An embedding model object with `embed_query` and `embed_documents` methods.

    Returns:
        tuple:
            - embedded_query (numpy.ndarray): The embedding vector of the query.
            - embedded_documents (numpy.ndarray): The embedding matrix of the documents.
    """
    # Embed the query and documents using the embedding model
    embedded_query = hf_embeddings.embed_query(q)
    embedded_documents = hf_embeddings.embed_documents(docs)
# Calculate similarity scores using dot product and normalize with the magnitudes
query_norm = np.linalg.norm(embedded_query)
document_norms = np.linalg.norm(embedded_documents, axis=1)
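The block above stops at the vector norms; the cosine-similarity step itself is not shown in this excerpt. A minimal sketch of that step, assuming embedded_query and embedded_documents are the lists returned by the embedding model, is:

# Assumed step: cosine similarity between the query and every document.
# Shaping the result as (1, n_documents) keeps it compatible with the
# similarity[0][...] and similarity.argsort()[0] usage shown earlier.
similarity = (
    np.array(embedded_query).reshape(1, -1) @ np.array(embedded_documents).T
) / (query_norm * document_norms)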
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
# Combine documents and query for PCA
all_vectors = np.vstack([docs_vector, query_vector])  # Stack query vector with docs
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(all_vectors)
# Separate reduced vectors for documents and query
doc_vectors_2d = reduced_vectors[:-1]  # All but the last point (documents)
query_vector_2d = reduced_vectors[-1]  # Last point (query)
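These 2-D points can then be plotted; the sketch below is illustrative (labels and styling are not taken from the original code):

# Sketch: visualize documents and the query in 2-D PCA space
plt.figure(figsize=(8, 6))
plt.scatter(doc_vectors_2d[:, 0], doc_vectors_2d[:, 1], color="blue", label="Documents")
plt.scatter(query_vector_2d[0], query_vector_2d[1], color="red", marker="*", s=200, label="Query")
plt.legend()
plt.title("PCA projection of document and query embeddings")
plt.show()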
vector = hf_embeddings.embed_query("Please tell me more about LangChain.")
print(len(vector))
vector = hf_embeddings.embed_documents([
    "Hi, nice to meet you.",
    "LangChain simplifies the process of building applications with large language models.",
    "The LangChain English tutorial is structured based on LangChain's official documentation, cookbook, and various practical examples to help users utilize LangChain more easily and effectively.",
    "LangChain simplifies the process of building applications with large-scale language models.",
    "Retrieval-Augmented Generation (RAG) is an effective technique for improving AI responses.",
])
print(len(vector))
print([len(item) for item in vector])
import os

os.makedirs("./cache/", exist_ok=True)
print(os.path.exists("./cache/"))  # Check if the directory exists
print(os.access("./cache/", os.W_OK))  # Check if the directory is writable
from langchain.storage import LocalFileStore
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores.faiss import FAISS
# Configure basic embeddings using OpenAI embeddings
underlying_embeddings = OpenAIEmbeddings(
    model="bge-m3",
    base_url='http://localhost:9997/v1',
    api_key='cannot be empty',
    # dimensions=1024,
)
# Set up local file storage
store = LocalFileStore("./cache/")
# Create a cache-backed embedder using the base embeddings and the local store
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=underlying_embeddings.model,  # Namespace the cache by model name to avoid key collisions
)
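As configured above, only document embeddings are cached. Recent langchain releases also accept a query_embedding_cache argument on from_bytes_store; the variant below is a hedged sketch and depends on the installed version:

# Assumed variant: also cache query embeddings
# (parameter availability depends on your langchain version)
cached_embedder_with_query_cache = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=underlying_embeddings.model,
    query_embedding_cache=True,
)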
Before embedding, the cache is empty:
list(store.yield_keys())
[]
Load the document, split it into chunks, embed each chunk, and load the embeddings into the vector store.
from langchain.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
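The loading and splitting code itself is not included in this excerpt; a minimal sketch, assuming a plain-text file at a hypothetical path ./data/sample.txt, would be:

# Assumed step: load a text file and split it into chunks before embedding
raw_documents = TextLoader("./data/sample.txt").load()  # hypothetical path
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)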
%time db = FAISS.from_documents(documents, cached_embedder)
list(store.yield_keys())[:5]
CPU times: user 6.92 ms, sys: 905 μs, total: 7.82 ms
Wall time: 7.34 ms
['bge-m34b802135-9b69-54ac-835f-f31f0a8f73cf',
'bge-m34fd4987e-f5b6-52f8-91e2-886802754643',
'bge-m3229c1600-8452-5938-b611-45db25315327',
'bge-m3fed9c955-3b6d-5ce9-b7d2-235f35d18610',
'bge-m39668cb63-4ad2-528c-9bf2-aecbfa54e1cd']
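To confirm that the cache is actually being used, you could build the vector store a second time with the same cached embedder; the embeddings should then be read back from ./cache/ instead of calling the model, so the run should be noticeably faster (sketch only, no timing recorded here):

# Assumed check: a second build should hit the local embedding cache
%time db2 = FAISS.from_documents(documents, cached_embedder)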