
Launch Open-Source Apps with LangChain
Notebook

In [1]:
1
%%writefile requirements.txt2
langchain==0.0.3393
openai==1.3.34
pdf2image==1.17.05
pdfminer==201911256
pdfminer.six==202211057
pillow_heif==0.13.18
tabulate==0.9.09
tiktoken==0.5.110
unstructured==0.11.011
opencv-contrib-python-headless==4.8.1.7812
unstructured.pytesseract==0.3.1213
unstructured.inference==0.7.15
In [2]:
1
%conda install -y --quiet poppler tesseract
In [3]:
1
%pip install -r requirements.txt --quiet
In [4]:
import nltk

# Tokenizer and POS-tagger data used by `unstructured` when partitioning PDFs.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
In [5]:
from langchain.document_loaders import OnlinePDFLoader

# Download the PDF from the URL and parse it into LangChain Document objects.
loader = OnlinePDFLoader("http://leavcom.com/pdf/DBpdf.pdf")

data = loader.load()
In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Quick sanity check on what the loader produced.
print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")
In [7]:
# Split the document into ~2000-character chunks (no overlap) for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

print(f"You have {len(texts)} pages")
In [8]:
%%sql
-- Start from a clean slate, then create the working database.
DROP DATABASE IF EXISTS pdf_db;
CREATE DATABASE IF NOT EXISTS pdf_db;
Out [8]:
Action Required
Make sure to select the pdf_db database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.
In [9]:
%%sql
-- Table holding each text chunk and its embedding (packed float32 blob).
DROP TABLE IF EXISTS pdf_docs1;
CREATE TABLE IF NOT EXISTS pdf_docs1 (
    id INT PRIMARY KEY,
    content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
    vector BLOB
);
Out [9]:
In [10]:
import os
import getpass

# Prompt for the key interactively instead of hardcoding it in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
In [11]:
import json
import sqlalchemy as sa
from langchain.embeddings import OpenAIEmbeddings
from singlestoredb import create_engine

# connection_url is set by the notebook's database drop-down (pdf_db).
conn = create_engine().connect()

embedder = OpenAIEmbeddings()

# Fetch all embeddings in one call
embeddings = embedder.embed_documents([doc.page_content for doc in texts])

# Build query parameters
params = []
for i, (doc, embedding) in enumerate(zip(texts, embeddings)):
    # Bind the chunk's text, not the Document object itself — a Document is
    # not a valid SQL parameter type and would not round-trip as TEXT.
    params.append(dict(id=i + 1, content=doc.page_content, vector=json.dumps(embedding)))

stmt = sa.text("""
    INSERT INTO pdf_docs1 (
        id,
        content,
        vector
    )
    VALUES (
        :id,
        :content,
        JSON_ARRAY_PACK_F32(:vector)
    )
""")

# Executemany: one round trip for all chunks.
conn.execute(stmt, params)
Out [11]:
<sqlalchemy.engine.cursor.CursorResult at 0x7ec535dc5e80>
In [12]:
%%sql
-- Sanity check: unpack one stored blob back into a JSON array of floats.
SELECT JSON_ARRAY_UNPACK_F32(vector) AS vector
FROM pdf_docs1
LIMIT 1;
Out [12]:
In [13]:
query_text = "Will object-oriented databases be commercially successful?"

# Embed the question with the same model used for the document chunks.
query_embedding = embedder.embed_documents([query_text])[0]

# Rank chunks by dot product between the query vector and stored vectors.
stmt = sa.text("""
    SELECT
        content,
        DOT_PRODUCT_F32(JSON_ARRAY_PACK_F32(:embedding), vector) AS score
    FROM pdf_docs1
    ORDER BY score DESC
    LIMIT 1
""")

results = conn.execute(stmt, dict(embedding=json.dumps(query_embedding)))

for row in results:
    print(row[0])
In [14]:
import openai

# Reads OPENAI_API_KEY from the environment (set above).
client = openai.OpenAI()

# `row` is the best-matching chunk from the previous cell.
prompt = f"The user asked: {query_text}. The most similar text from the document is: {row[0]}"

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)

print(response.choices[0].message.content)
Clean up
In [15]:
%%sql
-- Remove everything this notebook created.
DROP DATABASE IF EXISTS pdf_db;
Out [15]:

Details
About this Template
This notebook shows how to use the LangChain connector with SingleStoreDB as the vector database for your applications.
This Notebook can be run in Standard and Enterprise deployments.
Tags
License
This Notebook has been released under the Apache 2.0 open source license.
See Notebook in action
Launch this notebook in SingleStore and start executing queries instantly.