Rename PDF files after their document titles using the `llm` library (llm.datasette.io)

Fork of https://github.com/heversonbr/scientific-paper-pdf-rename

#!/usr/bin/env python3
"""Rename PDF files after the title of the document they contain.

For each PDF three candidate titles are computed from the first page:
an LLM-generated title, the text set in the largest font, and the PDF
metadata title.  The first non-empty candidate (in that order) becomes the
new file name and the renamed file is moved into an ``auto_renamed_pdf``
sub-directory of the source directory.
"""
import hashlib
import logging
import os
import platform
import re
import shutil
import signal
import string
import sys

import fitz
import llm
import pkg_resources

from src.helper import *

# Define logger / logger config
log_level = logging.INFO
logging.basicConfig(format='[%(asctime)s] - [%(levelname)s]- %(message)s', level=log_level)
logger = logging.getLogger()
set_helper_logger(log_level)

# Set your OpenAI API key.  The OPENAI_API_KEY environment variable is
# preferred so the secret does not have to be written into this file; the
# placeholder is kept as a fallback for manual editing.
model = llm.get_model("gpt-3.5-turbo")
model.key = os.environ.get("OPENAI_API_KEY") or "<!-- INSERT OPENAI KEY -->"

# Characters that must never reach a file name: path separators, characters
# rejected by common file systems, double quotes and ASCII control characters.
_FORBIDDEN_FILENAME_CHARS = re.compile(r'[/\\?*:<>|"\x00-\x1f]')

# System prompt sent with every title request.
_TITLE_SYSTEM_PROMPT = (
    'Given the extracted text from a PDF document, craft the most fitting title. '
    'The title should be accurate, clear, concise, and relevant to researchers, '
    'adhering, if possible, to the standard of ABNT NBR 6023 (e.g., "MARX, Karl; '
    'ENGELS, Friedrich. The communist manifesto. In: Ideals and ideologies. '
    'Routledge, 2019. p. 243-255."). However, ensure the title adheres to filename '
    'restrictions: avoid characters like `/ \\ ? * : < > |` and control characters '
    '(ASCII 0-31). Prioritize alphanumeric characters, hyphens, underscores, and '
    'periods. Your response MUST contain ONLY the crafted title, NO MORE, NO LESS.'
)


def keyboardInterruptHandler(signal, frame):
    """SIGINT handler: log a farewell message and exit with status 0."""
    logger.info('You pressed Ctrl+C! Leaving...')
    sys.exit(0)


def validate_arguments(arguments):
    """Validate the command line and split the target into directory and file.

    Args:
        arguments: argv-style list; ``arguments[1]`` is the target path.

    Returns:
        ``(base_dir, filename)``.  ``filename`` is ``''`` when the target is
        a directory.

    Exits the process when the argument count is wrong, the path does not
    exist, or the target is a file that does not end in ``.pdf``.
    """
    if len(arguments) > 2:
        logger.error('Wrong number of arguments!')
        print_usage()
        sys.exit()
    # Check the list that was passed in (not sys.argv) and also cover an
    # empty list, which would otherwise leave the return values unbound.
    if len(arguments) < 2:
        logger.error("Missing arguments!")
        print_usage()
        sys.exit()

    target = os.path.abspath(arguments[1])
    if not os.path.exists(target):
        logger.error("Directory or file [" + arguments[1] + "] path does not exist!")
        sys.exit()
    if os.path.isdir(target):
        return target, ''
    if target.endswith('.pdf'):
        return os.path.dirname(target), os.path.basename(target)
    logger.error('Argument is not a pdf file')
    sys.exit()


def hash_file(target_file):
    """Return the SHA-256 hex digest of a file, read in 64 KiB chunks.

    Exits the process if ``target_file`` is not a regular file.
    """
    blocksize = 65536
    target_file = os.path.abspath(target_file)
    if not os.path.isfile(target_file):
        logger.error(target_file + ' is not a file')
        sys.exit()
    hasher = hashlib.new('sha256')
    with open(target_file, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def parse_title(title, max_length=None):
    """Turn a raw title into a safe ``.pdf`` file name.

    The title is truncated to ``max_length`` characters (default 125), every
    run of non-alphanumeric characters becomes a single underscore, and
    ``.pdf`` is appended.

    Returns:
        The file name, or ``''`` when nothing usable remains, so callers can
        tell "no title" apart from a real one.
    """
    if max_length is None:
        max_length = 125
    title = title[:max_length]
    title = re.sub(r'[^a-zA-Z0-9]+', ' ', title).strip()
    if not title:
        return ''
    title = re.sub(r'\s', '_', title)
    # No whitespace is left at this point, so capwords() sees one word: the
    # first character is upper-cased and the rest lower-cased.
    return string.capwords(title) + '.pdf'


def get_page_text(current_page):
    """Return the ``blocks`` list of ``current_page.get_text('dict')``.

    Exits the process when the running Python or the installed PyMuPDF
    version does not start with one of the accepted version prefixes.
    """
    python_version = platform.python_version()
    library_name = "PyMuPDF"
    library_version = pkg_resources.get_distribution(library_name).version
    acceptable_python_versions = ["3.11", "3.7", "3.8"]
    acceptable_pymupdf_versions = ["1.22", "1.18"]

    python_ok = any(python_version.startswith(ver) for ver in acceptable_python_versions)
    pymupdf_ok = any(library_version.startswith(ver) for ver in acceptable_pymupdf_versions)
    if not (python_ok and pymupdf_ok):
        logger.warning('Python version is: ' + python_version)
        logger.warning('PyMuPDF version is: ' + library_version)
        logger.error('Your Python version or PyMuPDF is not at the required level!')
        logger.error('Please ensure that both meet the specified version requirements for this script to function properly.')
        sys.exit()

    blocks = current_page.get_text('dict')['blocks']
    logger.debug('Python version is: ' + python_version)
    logger.debug('PyMuPDF version is: ' + library_version)
    return blocks


def get_best_title_from_llm(text_content):
    """Ask the configured LLM for a title describing ``text_content``.

    Returns:
        The stripped model response, or ``''`` when there is no text to send
        or the request fails, so the caller can fall back to the other
        title candidates.
    """
    if not text_content.strip():
        # Nothing was extracted (e.g. a scanned page); the model could only
        # invent a title.
        return ''
    try:
        response = model.prompt(text_content, system=_TITLE_SYSTEM_PROMPT)
        return response.text().strip()
    except Exception:
        # Boundary to an external service: log the failure and let the
        # font/metadata based candidates take over.
        logger.exception('LLM title request failed')
        return ''


def _sanitize_llm_title(title, max_length=200):
    """Make an LLM-produced title safe to use as a ``.pdf`` file name.

    Forbidden characters are replaced by spaces, whitespace is collapsed,
    leading/trailing dots and spaces are dropped (so ``..`` cannot survive)
    and a single ``.pdf`` extension is enforced.  Returns ``''`` when nothing
    usable remains.
    """
    cleaned = _FORBIDDEN_FILENAME_CHARS.sub(' ', title)
    cleaned = ' '.join(cleaned.split())
    if cleaned.lower().endswith('.pdf'):
        cleaned = cleaned[:-4]
    cleaned = cleaned.strip(' .')[:max_length].rstrip(' .')
    return cleaned + '.pdf' if cleaned else ''


def _collect_horizontal_spans(blocks):
    """Return ``(spans, text)`` for the left-to-right horizontal text in ``blocks``.

    ``spans`` is a list of ``(font_size, text)`` tuples; ``text`` is every
    span text followed by a space, concatenated in reading order.
    """
    spans = []
    for blk in blocks:
        if blk['type'] != 0:  # only type-0 blocks carry 'lines'
            continue
        for line in blk['lines']:
            if line['dir'] == (1.0, 0.0) and line['wmode'] == 0:
                spans.extend((span['size'], span['text']) for span in line['spans'])
    text_content = ''.join(text + ' ' for _, text in spans)
    return spans, text_content


def _largest_font_title(spans, max_lines=5):
    """Join up to ``max_lines`` spans that share the largest font size.

    Spans whose text has two characters or fewer are ignored.
    """
    largest_font = 0
    parts = []
    # sorted() is stable, so equally sized spans keep their page order.
    for size, text in sorted(spans, key=lambda span: span[0], reverse=True):
        if len(text) <= 2:
            continue
        if size < largest_font:
            # Sizes only decrease from here on; nothing else can match.
            break
        largest_font = size
        parts.append(text.strip())
        if len(parts) >= max_lines:
            break
    return ' '.join(parts)


def scan_title(full_file_name, page_num=None):
    """Compute the three title candidates for a PDF.

    Args:
        full_file_name: path of the PDF to open.
        page_num: zero-based page to inspect (default: first page).

    Returns:
        ``(meta_title, parsed_found_title, best_title)`` where the first two
        are ``parse_title`` results (``''`` when unusable) and ``best_title``
        is the raw LLM answer (``''`` when unavailable).
    """
    if page_num is None:
        page_num = 0
    doc = fitz.open(full_file_name)
    try:
        # Metadata, or its title entry, may be missing altogether.
        raw_meta_title = ((doc.metadata or {}).get('title') or '').strip()
        # Very short metadata titles are treated as "no title" instead of
        # being handed back unparsed.
        meta_title = parse_title(raw_meta_title) if len(raw_meta_title) > 5 else ''
        page = doc.load_page(page_num)
        spans, text_content = _collect_horizontal_spans(get_page_text(page))
    finally:
        doc.close()

    parsed_found_title = parse_title(_largest_font_title(spans))
    best_title = get_best_title_from_llm(text_content)
    return meta_title, parsed_found_title, best_title


def do_rename(fullpath_current_filename, fullpath_new_filename):
    """Rename a file without overwriting anything.

    Returns:
        ``True`` when the file was renamed, ``False`` when the names are
        identical, the target already exists, or the rename failed.
    """
    if fullpath_current_filename == fullpath_new_filename:
        logger.warning('Current filename and found title are already the same. Skipping...')
        return False
    if os.path.exists(fullpath_new_filename):
        # os.rename would silently replace the existing file on POSIX.
        logger.warning('Target file already exists, not overwriting: ' + fullpath_new_filename)
        return False
    try:
        os.rename(fullpath_current_filename, fullpath_new_filename)
    except OSError as e:
        logger.error("An exception occurred. File not renamed! (%s)", e)
        return False
    return True


def confirm_to_continue():
    """Ask whether to process the current file.

    Returns:
        ``True`` for ``c`` (continue), ``False`` for ``s`` (skip).  Exits the
        process for ``a`` (abort).
    """
    prompt = 'Choose [c] to continue, [s] to skip, or [a] to abort : \n'
    choice = input(prompt)
    while choice not in ('c', 's', 'a'):
        choice = input(prompt)
    if choice == 'a':
        logger.info("aborting...")
        sys.exit()
    return choice == 'c'


def select_loop_type():
    """Ask how a directory should be processed.

    Returns:
        ``'1'`` (rename everything) or ``'2'`` (confirm each file).  Exits
        the process for ``q``.
    """
    logger.info('Choose [1] : for renaming all pdf files in the directory')
    logger.info('Choose [2] : for one-by-one pdf file confirmation')
    prompt = 'Choose [1], [2] or q [quit] : \n'
    choice = input(prompt)
    while choice not in ('1', '2', 'q'):
        choice = input(prompt)
    if choice == 'q':
        logger.info("aborting...")
        sys.exit()
    return choice


def move_file(fullpath_src_file, destination_dir, dest_file):
    """Move a file into ``destination_dir`` as ``dest_file``.

    The directory is created when missing.  An existing destination file is
    never overwritten.  Failures are logged, not raised.
    """
    destination = os.path.join(destination_dir, dest_file)
    try:
        os.makedirs(destination_dir, exist_ok=True)
        if os.path.exists(destination):
            logger.warning('File ' + fullpath_src_file + ' was not moved to ' + destination_dir
                           + ' (a file with that name already exists)')
            return
        shutil.move(fullpath_src_file, destination)
    except OSError as e:
        logger.exception(e.strerror or str(e))
        logger.warning('File ' + fullpath_src_file + ' was not moved to ' + destination_dir)


def search_candidate_title(src_dir, current_file):
    """Return the best available new file name for a PDF, or ``None``.

    Preference order: LLM title, largest-font title, metadata title.
    """
    meta_title, font_based_title, best_title = scan_title(os.path.join(src_dir, current_file))
    # The model's answer is free text; make sure it is a valid, extension
    # bearing file name that cannot escape the directory.
    best_title = _sanitize_llm_title(best_title)
    for candidate in (best_title, font_based_title, meta_title):
        if candidate:
            return candidate
    logger.info('No potential Title was found for : ' + current_file)
    return None


def rename_files_in_dir(base_dir):
    """Rename every ``.pdf`` file directly inside ``base_dir``.

    Files whose content hash was already seen in this run get the prefix
    ``duplicated_`` instead of a new title.

    Returns:
        ``(renamed_counter, total_counter)``.
    """
    renamed_counter = 0
    total_counter = 0
    seen_fingerprints = set()
    full_path_base_dir = os.path.abspath(base_dir)

    if not os.path.isdir(full_path_base_dir):
        logger.error('Directory does not exist!')
        return renamed_counter, total_counter

    # Only regular files: hash_file() terminates the program on anything else.
    list_of_files = [
        name for name in os.listdir(full_path_base_dir)
        if name.endswith('.pdf') and os.path.isfile(os.path.join(full_path_base_dir, name))
    ]
    if not list_of_files:
        logger.info('no pdf files found in the target directory')
        return renamed_counter, total_counter

    logger.info(str(len(list_of_files)) + ' files found in the target directory!')
    loop_type = select_loop_type()
    renamed_dir = os.path.join(full_path_base_dir, 'auto_renamed_pdf')

    for current_file in list_of_files:
        total_counter += 1
        current_path = os.path.join(full_path_base_dir, current_file)
        fingerprint = hash_file(current_path)
        logger.info('*' * 80)
        logger.info('[Current file name] : ' + current_file)
        logger.debug('[Current file hash] : ' + fingerprint)
        logger.debug('[loop_type] : ' + loop_type)

        if loop_type == '2' and not confirm_to_continue():
            continue

        if fingerprint in seen_fingerprints:
            logger.warning('Another file with the same content (hash) was found in the source directory!')
            logger.info('Skipping file: ' + current_file + ' adding prefix `duplicated_`to it')
            try:
                os.rename(current_path, os.path.join(full_path_base_dir, 'duplicated_' + current_file))
            except OSError as e:
                logger.error('Could not add the prefix to %s (%s)', current_file, e)
            continue
        seen_fingerprints.add(fingerprint)

        found_title = search_candidate_title(full_path_base_dir, current_file)
        if found_title is None:
            continue
        new_path = os.path.join(full_path_base_dir, found_title)
        if do_rename(current_path, new_path):
            move_file(new_path, renamed_dir, found_title)
            renamed_counter += 1

    return renamed_counter, total_counter


def rename_target_file(src_dir, filename):
    """Rename a single PDF.

    Returns:
        ``True`` only when the file was actually renamed.
    """
    fullpath_filename = os.path.join(src_dir, filename)
    if not os.path.isfile(fullpath_filename):
        logger.error("File does not exist!")
        return False
    if not fullpath_filename.endswith('.pdf'):
        logger.debug("File is not a pdf!")
        return False

    found_title = search_candidate_title(src_dir, filename)
    if found_title is None:
        return False
    new_path = os.path.join(src_dir, found_title)
    if not do_rename(fullpath_filename, new_path):
        return False
    move_file(new_path, os.path.join(src_dir, 'auto_renamed_pdf'), found_title)
    return True


def main():
    """Entry point: rename the PDF or directory of PDFs given on the command line."""
    print_header()
    base_dir, filename = validate_arguments(sys.argv)
    signal.signal(signal.SIGINT, keyboardInterruptHandler)
    target_path = os.path.join(base_dir, filename)
    logger.info('[Target Path]: ' + target_path)
    rename_counter = 0
    if os.path.isdir(target_path):
        rename_counter, total_counter = rename_files_in_dir(base_dir)
        logger.info('*' * 80)
        logger.info('Finished => Total files: ' + str(total_counter) + ' Renamed files: ' + str(rename_counter))
    else:
        if rename_target_file(base_dir, filename):
            rename_counter += 1
        logger.info('*' * 80)
        logger.info('Finished => Renamed files : ' + str(rename_counter))


if __name__ == "__main__":
    main()
https://github.com/heversonbr/scientific-paper-pdf-rename