import fitz # PyMuPDF
import os
import sys
from glob import glob
def extract_text_from_pdf(pdf_path, txt_output_path):
    """Append the text of every page of a PDF to a UTF-8 text file.

    Each page's text is written followed by a blank line. The output file is
    opened in append mode, so any content already present at
    ``txt_output_path`` is kept and the new text is added after it.

    Errors are not raised to the caller: any exception is caught and
    reported on stdout as ``Error processing <pdf_path>: <error>``.

    Args:
        pdf_path: Path of the PDF document to read.
        txt_output_path: Path of the text file to append to.
    """
    try:
        # The PDF is opened first so that a failure to open it never creates
        # an output file. Both context managers release their handles even
        # when extraction or writing fails part-way through, and the output
        # file is opened once instead of once per page.
        with fitz.open(pdf_path) as pdf_document, open(
            txt_output_path, "a", encoding="utf-8"
        ) as text_file:
            for page in pdf_document:
                text_file.write(page.get_text() + "\n\n")
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
def main(input_folder, output_folder):
    """Convert every matching PDF into a ``.txt`` file in ``output_folder``.

    Args:
        input_folder: Either a glob pattern (e.g. ``/docs/*.pdf``) or the
            path of a directory. For a directory, every file directly inside
            it whose name ends in ``.pdf`` (case-insensitive) is processed.
        output_folder: Directory that receives the text files; it is created
            if it does not exist.
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isdir(input_folder):
        # A bare directory would otherwise be globbed as a single "file" and
        # handed to the PDF reader; expand it to the PDFs it contains.
        pdf_files = sorted(
            os.path.join(input_folder, name)
            for name in os.listdir(input_folder)
            if name.lower().endswith(".pdf")
            and os.path.isfile(os.path.join(input_folder, name))
        )
    else:
        pdf_files = glob(input_folder)

    if not pdf_files:
        print(f"No PDF files found for {input_folder}")
        return

    for pdf_file in pdf_files:
        # splitext swaps only the final extension, whatever its case;
        # str.replace would rewrite every ".pdf" substring in the name and
        # leave names such as "x.PDF" unchanged.
        stem, _ = os.path.splitext(os.path.basename(pdf_file))
        txt_output_path = os.path.join(output_folder, stem + ".txt")
        # Extract text from the PDF and save it to the output folder
        extract_text_from_pdf(pdf_file, txt_output_path)
        print(f"Processed {pdf_file} to {txt_output_path}")
if __name__ == "__main__":
    # An unquoted pattern such as /input/folder/*.pdf is expanded by the
    # shell into one argument per matching file, so accept any number of
    # inputs followed by the output folder rather than exactly two arguments.
    if len(sys.argv) < 3:
        print("Usage: python3 script.py /input/folder/*.pdf /output/folder")
        sys.exit(1)
    *input_args, output_folder = sys.argv[1:]
    for input_folder in input_args:
        main(input_folder, output_folder)
# Reference:
# https://github.com/simonw/files-to-prompt