ChinarQ-AI Uzaiir committed on
Commit 3ea0a6e · verified · 1 Parent(s): 997e037

Update src/PDFprocess_sample.py (#11)


- Update src/PDFprocess_sample.py (b317a4821026779107fdc976c88aad871c78cd3f)


Co-authored-by: Khan <[email protected]>

Files changed (1)
  1. src/PDFprocess_sample.py +49 -49
src/PDFprocess_sample.py CHANGED
@@ -1,49 +1,49 @@
-import tempfile
-import streamlit as st
-import pickle
-from langchain_google_genai import GoogleGenerativeAIEmbeddings
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-import faiss
-
-
-def process_pdf(uploaded_file):
-
-    all_documents = []
-    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-
-    main_placeholder = st.empty()
-    # Creating a temporary file to store the uploaded PDF's
-    main_placeholder.text("Data Loading...Started...✅✅✅")
-    for uploaded_file in uploaded_file:
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
-            temp_file.write(uploaded_file.read())  # write file to temporary
-            temp_file_path = temp_file.name  # Get the temporary file path
-
-
-        # Load the PDF's from the temporary file path
-
-
-        loader = PyPDFLoader(temp_file_path)  # Document loader
-        doc = loader.load()  # load Document
-        main_placeholder.text("Text Splitter...Started...✅✅✅")
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Recursive Character String
-        #final_documents = text_splitter.split_documents(doc)# splitting
-        final_documents = text_splitter.split_documents(doc)
-        all_documents.extend(final_documents)
-
-
-    if all_documents:
-        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
-        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
-        st.session_state.docs = all_documents
-
-        # Save FAISS vector store to disk
-        faiss_index = st.session_state.vectors.index  # Extract FAISS index
-        faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
-        main_placeholder.text("Vector database created!...✅✅✅")
-
-    else:
-        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
-
+import tempfile
+import streamlit as st
+import pickle
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+import faiss
+
+
+def process_pdf(uploaded_file):
+
+    all_documents = []
+    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+
+    main_placeholder = st.empty()
+    # Creating a temporary file to store the uploaded PDF's
+    main_placeholder.text("Data Loading...Started...✅✅✅")
+    for uploaded_file in uploaded_file:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            temp_file.write(uploaded_file.read())  # write file to temporary
+            temp_file_path = temp_file.name  # Get the temporary file path
+
+
+        # Load the PDF's from the temporary file path
+
+
+        loader = PyPDFLoader(temp_file_path)  # Document loader
+        doc = loader.load()  # load Document
+        main_placeholder.text("Text Splitter...Started...✅✅✅")
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Recursive Character String
+        #final_documents = text_splitter.split_documents(doc)# splitting
+        final_documents = text_splitter.split_documents(doc)
+        all_documents.extend(final_documents)
+
+
+    if all_documents:
+        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
+        st.session_state.docs = all_documents
+
+        # Save FAISS vector store to disk
+        faiss_index = st.session_state.vectors.index  # Extract FAISS index
+        faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
+        main_placeholder.text("Vector database created!...✅✅✅")
+
+    else:
+        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
+
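
Note: the function persists only the raw FAISS index to disk (faiss.write_index); the split documents and the LangChain wrapper live only in st.session_state and are lost when the Streamlit session ends. Below is a minimal sketch, not part of the commit, of how a later session could rebuild the vector store from that file. It assumes the split documents were also pickled to a hypothetical docs.pkl (the pickle import above is currently unused), and that document order matches the order in which vectors were added to the index.

import pickle

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings


def load_vector_store(index_path="faiss_index.bin", docs_path="docs.pkl"):
    # Read the raw index written by faiss.write_index above.
    index = faiss.read_index(index_path)
    # Assumption: the split documents were pickled separately; the committed
    # code keeps them only in st.session_state.docs.
    with open(docs_path, "rb") as f:
        documents = pickle.load(f)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # Rebuild the LangChain wrapper: FAISS row i maps back to document i.
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
    index_to_docstore_id = {i: str(i) for i in range(len(documents))}
    return FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )

A simpler alternative would be st.session_state.vectors.save_local("faiss_store") paired with FAISS.load_local("faiss_store", embeddings, allow_dangerous_deserialization=True), which persist the docstore and id mapping alongside the index; the sketch above sticks to the faiss.write_index call the commit actually makes.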