Remove `[:1000]` to read the entire contents of each file. Keep it (or increase the value) in case OpenAI complains about the number of tokens used.
docs.py
- Code: Select all Expand view
"""Ask an OpenAI model a question about the source files in a folder.

The script reads the beginning of every file matching ``file_extension``
inside ``folder_path``, indexes those excerpts in a FAISS vector store using
OpenAI embeddings, retrieves the excerpts most similar to ``query`` and
prints the answer produced by a question-answering chain.
"""
import glob
import os

from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

# NOTE(review): placeholder value -- replace it with your own key, and avoid
# committing a real key to version control.
os.environ["OPENAI_API_KEY"] = "sk-...yours"

folder_path = "c:/harbour/src/vm"
file_extension = "*.c"

# Collect an excerpt of every matching file.
file_contents = []
for file_path in glob.glob(os.path.join(folder_path, file_extension)):
    # errors="replace" keeps one stray non-UTF-8 byte in a source file from
    # aborting the whole run with a UnicodeDecodeError.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()
    # Only the first 1000 characters of each file are indexed; drop the
    # slice (or raise the limit) to index more of each file.
    file_contents.append(contents[:1000])

# Stop with a clear message instead of handing an empty list to the
# vector store.
if not file_contents:
    raise SystemExit(
        f"No files matching {file_extension!r} found in {folder_path!r}"
    )

# Embed the excerpts and build the similarity index.
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(file_contents, embeddings)

chain = load_qa_chain(OpenAI(), chain_type="stuff")

query = "what about is this text ?"
# Retrieve the excerpts closest to the query and let the chain answer from them.
docs = docsearch.similarity_search(query)
print(chain.run(input_documents=docs, question=query))
To run it:
python docs.py
output:
This text describes the license and copyright information for several APIs, including the String API, the Virtual Memory API, the MT mode functions, and the Array API (Harbour level)