ChinarQ-AI Uzaiir committed
Commit 6d0dda6 · verified · 1 Parent(s): 6a529b0

Updated Preprocessing file (#25)


- Updated Preprocessing file (c87ba057e3d7cd4eef91e544fe5c4334ea54b160)


Co-authored-by: Khan <[email protected]>

Files changed (1)
  1. src/PDFprocess_sample.py +41 -91
src/PDFprocess_sample.py CHANGED
@@ -1,104 +1,54 @@
- # import tempfile
- # import streamlit as st
- # import pickle
- # from langchain_google_genai import GoogleGenerativeAIEmbeddings
- # from langchain_community.document_loaders import PyPDFLoader
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_community.vectorstores import FAISS
- # import faiss
 
 
- # # def process_pdf(uploaded_file):
 
- # #     all_documents = []
- # #     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
 
- # #     main_placeholder = st.empty()
- # #     # Creating a temporary file to store the uploaded PDF's
- # #     main_placeholder.text("Data Loading...Started...✅✅✅")
- # #     for uploaded_file in uploaded_file:
- # #         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
- # #             temp_file.write(uploaded_file.read())  # write file to temporary
- # #             temp_file_path = temp_file.name  # Get the temporary file path
 
- # #         # Load the PDF's from the temporary file path
 
- # #         loader = PyPDFLoader(temp_file_path)  # Document loader
- # #         doc = loader.load()  # load Document
- # #         main_placeholder.text("Text Splitter...Started...✅✅✅")
- # #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
- # #         final_documents = text_splitter.split_documents(doc)  # splitting
- # #         all_documents.extend(final_documents)
 
- # #     if all_documents:
- # #         main_placeholder.text("Embedding Vector Started Building...✅✅✅")
- # #         st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
- # #         st.session_state.docs = all_documents
-
- # #         # Save FAISS vector store to disk
- # #         faiss_index = st.session_state.vectors.index  # Extract FAISS index
- # #         faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
- # #         main_placeholder.text("Vector database created!...✅✅✅")
-
- # #     else:
- # #         st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
-
-
- import streamlit as st
- import faiss
- from langchain_community.document_loaders import PyPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
- from langchain.vectorstores import FAISS
-
- def process_pdf_from_path(file_path):
-     """
-     Processes a PDF from a given path by:
-     - Loading the PDF
-     - Splitting it into manageable chunks
-     - Creating embeddings with Gemini
-     - Saving the FAISS vector index to disk
-
-     Parameters:
-         file_path (str): Path to the uploaded PDF file
-     """
-
-     all_documents = []
-
-     try:
-         # Initialize embeddings model
-         st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-
-         main_placeholder = st.empty()
-         main_placeholder.text("Loading PDF and preparing text... ✅")
-
-         # Load PDF document
-         loader = PyPDFLoader(file_path)
-         documents = loader.load()
-
-         # Split documents into smaller chunks
-         main_placeholder.text("Splitting text into chunks... ✅")
-         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-         final_documents = text_splitter.split_documents(documents)
-         all_documents.extend(final_documents)
 
          if all_documents:
-             main_placeholder.text("Creating vector embeddings... ✅")
-             # Generate vector store
-             st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
-             st.session_state.docs = all_documents
-
-             # Save FAISS index
-             faiss_index = st.session_state.vectors.index
-             faiss.write_index(faiss_index, "/tmp/faiss_index.bin")
 
-             main_placeholder.text("Vector database created successfully! 🎉")
 
          else:
-             st.error("No valid documents found in the uploaded PDF.")
-
-     except Exception as e:
-         st.error(f"An error occurred while processing the PDF: {e}")
 
+ import tempfile
+ import streamlit as st
+ import pickle
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ import faiss
+ import os
 
 
+ def process_pdf(uploaded_file):
 
+     all_documents = []
+     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
 
+     main_placeholder = st.empty()
+     # Create a temporary file for each uploaded PDF
+     main_placeholder.text("Data Loading...Started...✅✅✅")
+     for uploaded_file in uploaded_file:  # iterate over the list of uploaded files
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+             temp_file.write(uploaded_file.read())  # write the upload to the temporary file
+             temp_file_path = temp_file.name  # get the temporary file path
 
+         # Load the PDF from the temporary file path
+         loader = PyPDFLoader(temp_file_path)  # document loader
+         doc = loader.load()  # load the document
+         main_placeholder.text("Text Splitter...Started...✅✅✅")
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # recursive character splitter
+         final_documents = text_splitter.split_documents(doc)  # split into chunks
+         all_documents.extend(final_documents)
 
      if all_documents:
+         main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+         st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
+         st.session_state.docs = all_documents
+
+         # Save FAISS vector store to disk
+         faiss_index = st.session_state.vectors.index  # Extract FAISS index
+         # faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
 
+         output_path = "/tmp/faiss_index.bin"  # or another writable path
+         faiss.write_index(faiss_index, output_path)
+
+         main_placeholder.text("Vector database created!...✅✅✅")
+
      else:
+         st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
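
For orientation, here is a minimal sketch of how the restored process_pdf could be called from a Streamlit page. The uploader label, the button, and the import path are illustrative assumptions, not part of this commit:

# Hypothetical caller for process_pdf -- a sketch only
import streamlit as st
from PDFprocess_sample import process_pdf  # assumes src/ is on the Python path

uploaded_files = st.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)
if uploaded_files and st.button("Process"):
    process_pdf(uploaded_files)  # builds the vector store and writes /tmp/faiss_index.bin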
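
One design note: the commit persists only the raw FAISS index to /tmp/faiss_index.bin; the LangChain docstore and index-to-docstore-id mapping are not saved, so this binary alone cannot rebuild the full vector store (FAISS.save_local / FAISS.load_local would round-trip all three). A sketch of reading the raw index back, assuming the path used above:

# Hypothetical re-load of the raw index -- a sketch under the assumption above
import faiss

index = faiss.read_index("/tmp/faiss_index.bin")  # counterpart of faiss.write_index
print(index.ntotal)  # number of embedded chunks stored in the index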