Convert `.pdf` files to `UTF-8` text files

import fitz  # PyMuPDF
import os
import sys
from glob import glob


def extract_text_from_pdf(pdf_path, txt_output_path):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Pages are written in document order, each followed by a blank line.
    An existing file at ``txt_output_path`` is overwritten, so running the
    conversion again does not duplicate the text.

    Errors are reported on stdout instead of being raised so that a batch
    run can carry on with the remaining files.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_output_path: Path of the text file to write.

    Returns:
        True if the text was extracted and written, False if an error was
        reported.
    """
    try:
        # Open the PDF first: an unreadable input must not create or
        # truncate the output file.
        pdf_document = fitz.open(pdf_path)
        try:
            # Open the output once for the whole document rather than
            # re-opening it in append mode for every page.
            with open(txt_output_path, "w", encoding="utf-8") as text_file:
                for page in pdf_document:
                    text_file.write(page.get_text() + '\n\n')
        finally:
            # Release the document even when extraction or writing fails.
            pdf_document.close()
    except Exception as e:
        # Deliberate best-effort boundary: one bad PDF must not abort the
        # whole batch.
        print(f"Error processing {pdf_path}: {e}")
        return False
    return True


def _txt_output_path(pdf_file, output_folder):
    """Return the path inside output_folder of the text file for pdf_file."""
    # splitext swaps only the final extension, whatever its case, so the
    # result always ends in ".txt" and can never be the source PDF itself.
    stem, _extension = os.path.splitext(os.path.basename(pdf_file))
    return os.path.join(output_folder, stem + '.txt')


def main(input_folder, output_folder):
    """Convert every PDF selected by input_folder into a text file.

    Args:
        input_folder: Either a glob pattern selecting the PDF files (for
            example ``/input/folder/*.pdf``) or the path of a directory,
            in which case every ``.pdf`` file directly inside it (any
            letter case) is converted.
        output_folder: Directory that receives one ``.txt`` file per PDF;
            created if it does not exist.
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isdir(input_folder):
        # A plain directory was given: take the PDFs it contains.
        pdf_files = [
            os.path.join(input_folder, name)
            for name in os.listdir(input_folder)
            if name.lower().endswith('.pdf')
        ]
    else:
        pdf_files = glob(input_folder)

    if not pdf_files:
        print(f"No PDF files found for {input_folder}")
        return

    # Sorted so that the processing order is reproducible between runs.
    for pdf_file in sorted(pdf_files):
        txt_output_path = _txt_output_path(pdf_file, output_folder)
        # Report success only when the extraction actually succeeded; the
        # failure message is printed by extract_text_from_pdf itself.
        if extract_text_from_pdf(pdf_file, txt_output_path):
            print(f"Processed {pdf_file} to {txt_output_path}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        # The pattern is quoted in the usage text because an unquoted glob
        # is expanded by the shell into several arguments.
        print("Usage: python3 script.py '/input/folder/*.pdf' /output/folder")
        sys.exit(1)

    input_folder = sys.argv[1]
    output_folder = sys.argv[2]

    main(input_folder, output_folder)

Comments (0)