Spaces:

Chinar-Q-AI
/

pdf-Interactor

Sleeping

App Files Files Community

ChinarQ-AI

Uzaiir committed on May 19

Commit

6d0dda6

verified ·

1 Parent(s): 6a529b0

Updated Preprocessing file (#25)

Browse files

- Updated Preprocessing file (c87ba057e3d7cd4eef91e544fe5c4334ea54b160)

Co-authored-by: Khan <[email protected]>

Files changed (1) hide show

src/PDFprocess_sample.py +41 -91

src/PDFprocess_sample.py CHANGED Viewed

@@ -1,104 +1,54 @@
-# import tempfile
-# import streamlit as st
-# import pickle
-# from langchain_google_genai import GoogleGenerativeAIEmbeddings
-# from langchain_community.document_loaders import PyPDFLoader
-# from langchain.text_splitter import RecursiveCharacterTextSplitter
-# from langchain_community.vectorstores import FAISS
-# import faiss
-# # def process_pdf(uploaded_file):
-# #     all_documents = []
-# #     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-# #     main_placeholder = st.empty()
-# #     # Creating  a temporary file to store the uploaded PDF's
-# #     main_placeholder.text("Data Loading...Started...✅✅✅")
-# #     for uploaded_file in uploaded_file:
-# #         with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
-# #             temp_file.write(uploaded_file.read()) ## write file to temporary
-# #             temp_file_path = temp_file.name  # Get the temporary file path
-# #             # Load the PDF's from the temporary file path
-# #         loader = PyPDFLoader(temp_file_path) # Document loader
-# #         doc= loader.load() # load Document
-# #         main_placeholder.text("Text Splitter...Started...✅✅✅")
-# #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
-# #         #final_documents = text_splitter.split_documents(doc)# splitting
-# #         final_documents = text_splitter.split_documents(doc)
-# #         all_documents.extend(final_documents)
-# #         if all_documents:
-# #             main_placeholder.text("Embedding Vector Started Building...✅✅✅")
-# #             st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
-# #             st.session_state.docs = all_documents
-# #             # Save FAISS vector store to disk
-# #             faiss_index = st.session_state.vectors.index  # Extract FAISS index
-# #             faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
-# #             main_placeholder.text("Vector database created!...✅✅✅")
-# #         else:
-# #             st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
-import streamlit as st
-import faiss
-from langchain_community.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_google_genai import GoogleGenerativeAIEmbeddings
-from langchain.vectorstores import FAISS
-def process_pdf_from_path(file_path):
-    """
-    Processes a PDF from a given path by:
-    - Loading the PDF
-    - Splitting it into manageable chunks
-    - Creating embeddings with Gemini
-    - Saving the FAISS vector index to disk
-    Parameters:
-    file_path (str): Path to the uploaded PDF file
-    """
-    all_documents = []
-    try:
-        # Initialize embeddings model
-        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-        main_placeholder = st.empty()
-        main_placeholder.text("Loading PDF and preparing text... ✅")
-        # Load PDF document
-        loader = PyPDFLoader(file_path)
-        documents = loader.load()
-        # Split documents into smaller chunks
-        main_placeholder.text("Splitting text into chunks... ✅")
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        final_documents = text_splitter.split_documents(documents)
-        all_documents.extend(final_documents)
         if all_documents:
-            main_placeholder.text("Creating vector embeddings... ✅")
-            # Generate vector store
-            st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
-            st.session_state.docs = all_documents
-            # Save FAISS index
-            faiss_index = st.session_state.vectors.index
-            faiss.write_index(faiss_index, "/tmp/faiss_index.bin")
-            main_placeholder.text("Vector database created successfully! 🎉")
         else:
-            st.error("No valid documents found in the uploaded PDF.")
-    except Exception as e:
-        st.error(f"An error occurred while processing the PDF: {e}")

+import tempfile
+import streamlit as st
+import pickle
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+import faiss
+import os
+def process_pdf(uploaded_file):
+    all_documents = []
+    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    main_placeholder = st.empty()
+    # Creating  a temporary file to store the uploaded PDF's
+    main_placeholder.text("Data Loading...Started...✅✅✅")
+    for uploaded_file in uploaded_file:
+        with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
+            temp_file.write(uploaded_file.read()) ## write file to temporary
+            temp_file_path = temp_file.name  # Get the temporary file path
+            # Load the PDF's from the temporary file path
+        loader = PyPDFLoader(temp_file_path) # Document loader
+        doc= loader.load() # load Document
+        main_placeholder.text("Text Splitter...Started...✅✅✅")
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
+        #final_documents = text_splitter.split_documents(doc)# splitting
+        final_documents = text_splitter.split_documents(doc)
+        all_documents.extend(final_documents)
         if all_documents:
+            main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+            st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
+            st.session_state.docs = all_documents
+            # Save FAISS vector store to disk
+            faiss_index = st.session_state.vectors.index  # Extract FAISS index
+            # faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
+            output_path = "/tmp/faiss_index.bin"  # or another writable path
+            faiss.write_index(faiss_index, output_path)
+            main_placeholder.text("Vector database created!...✅✅✅")
         else:
+            st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")