Rename PDF files by their detected title using the `llm` Python library (https://llm.datasette.io)

Fork of https://github.com/heversonbr/scientific-paper-pdf-rename
#!/usr/bin/env python3
import fitz
import os
import sys
import platform
import pkg_resources
import signal
import string
import hashlib
import shutil
import re
import logging
import llm
from src.helper import *

# Define logger / logger config
# The root logger is configured once at import time; every function in this
# module logs through ``logger``. The same level is handed to
# ``set_helper_logger`` (imported from src.helper) — presumably so the helper
# module logs at the same verbosity; confirm against src/helper.
log_level = logging.INFO
logging.basicConfig(format='[%(asctime)s] - [%(levelname)s]- %(message)s', level=log_level)
logger = logging.getLogger()
set_helper_logger(log_level)

# Select the LLM used for title generation and set your OpenAI API key.
# NOTE(review): the key below is a hard-coded placeholder that must be edited
# before running; consider reading it from the environment instead so a real
# key is never committed to source control.
model = llm.get_model("gpt-3.5-turbo")
model.key = "<!-- INSERT OPENAI KEY -->"

def keyboardInterruptHandler(signal, frame):
    """Handle SIGINT (Ctrl+C): log a farewell message and exit.

    The signature matches what ``signal.signal`` expects of a handler.

    Args:
        signal: Signal number passed by the interpreter (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        SystemExit: Always, with exit status 0.
    """
    # The message has no placeholders, so the former ``.format(signal)`` call
    # was a no-op; log the literal text directly.
    logger.info('You pressed Ctrl+C! Leaving...')
    sys.exit(0)

def validate_arguments(arguments):
    """Validate the command-line arguments and resolve the target location.

    Exactly one positional argument is expected after the program name: the
    path of a directory or of a single ``.pdf`` file.

    Args:
        arguments: Argument list in ``sys.argv`` layout (program name first).

    Returns:
        A ``(base_dir, filename)`` tuple. For a directory target ``base_dir``
        is its absolute path and ``filename`` is ``''``; for a file target
        they are the absolute parent directory and the file's base name.

    Raises:
        SystemExit: If the argument count is wrong, the path does not exist,
            or the path is a file that does not end in ``.pdf``.
    """
    if len(arguments) > 2:
        logger.error('Wrong number of arguments!')
        print_usage()
        sys.exit()
    # Check the list that was passed in, not the global sys.argv, so the
    # function validates what the caller actually gave it. "< 2" also covers
    # an empty list, which previously fell through and returned None.
    if len(arguments) < 2:
        logger.error("Missing arguments!")
        print_usage()
        sys.exit()

    target = os.path.abspath(arguments[1])
    if not os.path.exists(target):
        logger.error("Directory or file ["+ arguments[1] + "] path does not exist!")
        sys.exit()
    if os.path.isdir(target):
        return target, ''
    if target.endswith('.pdf'):
        return os.path.dirname(target), os.path.basename(target)
    logger.error('Argument is not a pdf file')
    sys.exit()

def hash_file(target_file):
    """Compute the SHA-256 hex digest of a file's contents.

    The file is read in 64 KiB chunks so its whole content is never held in
    memory at once.

    Args:
        target_file: Path of the file to fingerprint.

    Returns:
        The hexadecimal SHA-256 digest string.

    Raises:
        SystemExit: If the path does not refer to a regular file.
    """
    chunk_size = 65536
    digest = hashlib.new('sha256')
    path = os.path.abspath(target_file)

    if not os.path.isfile(path):
        logger.error(path + ' is not a file')
        sys.exit()

    with open(path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def parse_title(title, max_length=None):
    """Convert a raw title string into a ``.pdf`` file name.

    The title is truncated to ``max_length`` characters, every run of
    non-alphanumeric characters becomes a single underscore, and ``.pdf`` is
    appended.

    Args:
        title: Raw title text.
        max_length: Maximum number of raw characters kept before sanitising.
            Defaults to 125 when ``None``.

    Returns:
        The sanitised file name ending in ``.pdf``, or ``''`` when the title
        contains no alphanumeric characters at all. Returning an empty string
        (instead of the bare name ``.pdf``) lets callers detect "no title"
        with a simple emptiness check.
    """
    if max_length is None:
        max_length = 125
    title = title[:max_length]
    title = re.sub(r'[^a-zA-Z0-9]+', ' ', title).strip()
    if not title:
        return ''
    title = re.sub(r'\s', '_', title)
    # NOTE(review): after the substitution above there is no whitespace left,
    # so capwords() treats the whole name as one word: only the first letter
    # is upper-cased and the rest is lower-cased ("Hello_world"). Kept as-is
    # to preserve existing file names — confirm whether per-word
    # capitalisation was intended.
    return string.capwords(title) + '.pdf'

def get_page_text(current_page):
    """Return the ``'blocks'`` list of a page's dict-format text extraction.

    Before extracting, the running Python version and the installed PyMuPDF
    version are compared against fixed lists of accepted version prefixes.

    Args:
        current_page: Page object exposing ``get_text('dict')``.

    Returns:
        The value of ``current_page.get_text('dict')['blocks']``.

    Raises:
        SystemExit: If the Python or PyMuPDF version is not in the accepted
            lists.
    """
    supported_python = ["3.11", "3.7", "3.8"]
    supported_pymupdf = ["1.22", "1.18"]

    interpreter_version = platform.python_version()
    package_name = "PyMuPDF"
    package_version = pkg_resources.get_distribution(package_name).version

    # Both versions must start with one of the accepted prefixes.
    environment_ok = (
        any(interpreter_version.startswith(prefix) for prefix in supported_python)
        and any(package_version.startswith(prefix) for prefix in supported_pymupdf)
    )

    if not environment_ok:
        logger.warning('Python version is: ' + interpreter_version)
        logger.warning('PyMuPDF version is: ' + package_version)
        logger.error('Your Python version or PyMuPDF is not at the required level!')
        logger.error('Please ensure that both meet the specified version requirements for this script to function properly.')
        sys.exit()

    page_blocks = current_page.get_text('dict')['blocks']
    logger.debug('Python version is: ' + interpreter_version)
    logger.debug('PyMuPDF version is: ' + package_version)
    return page_blocks

def get_best_title_from_llm(text_content):
    """Ask the module-level LLM ``model`` to propose a title for the given text.

    Args:
        text_content: Text extracted from the PDF, sent as the user prompt.

    Returns:
        The model's reply text with surrounding whitespace removed.
    """
    system_prompt = 'Given the extracted text from a PDF document, craft the most fitting title. The title should be accurate, clear, concise, and relevant to researchers, adhering, if possible, to the standard of ABNT NBR 6023 (e.g., "MARX, Karl; ENGELS, Friedrich. The communist manifesto. In: Ideals and ideologies. Routledge, 2019. p. 243-255."). However, ensure the title adheres to filename restrictions: avoid characters like `/ \ ? * : < > |` and control characters (ASCII 0-31). Prioritize alphanumeric characters, hyphens, underscores, and periods. Your response MUST contain ONLY the crafted title, NO MORE, NO LESS.'
    reply = model.prompt(text_content, system=system_prompt)
    return reply.text().strip()

def scan_title(full_file_name, page_num=None):
    """Collect three candidate titles for a PDF.

    The candidates are:
      1. the ``title`` entry of the document metadata,
      2. the text rendered in the largest font on the inspected page, and
      3. a title proposed by the LLM from that page's text.

    Args:
        full_file_name: Path of the PDF to open.
        page_num: Zero-based index of the page to inspect. Defaults to the
            first page when ``None``.

    Returns:
        A ``(meta_title, parsed_found_title, best_title)`` tuple.
        ``meta_title`` is passed through ``parse_title`` only when it is
        longer than 5 characters; ``parsed_found_title`` is always passed
        through ``parse_title``; ``best_title`` is the LLM reply as returned
        by ``get_best_title_from_llm``.
    """
    if page_num is None:
        page_num = 0

    size_text_pairs = []
    text_parts = []
    doc = fitz.open(full_file_name)
    try:
        # The metadata dict or its 'title' entry may be missing/None; treat
        # both as "no title" instead of crashing on ``None.strip()``.
        metadata = doc.metadata or {}
        meta_title = (metadata.get('title') or '').strip()
        if len(meta_title) > 5:
            meta_title = parse_title(meta_title)

        page = doc.load_page(page_num)
        for blk in get_page_text(page):
            if blk['type'] != 0:
                continue
            for line in blk['lines']:
                # Only lines with direction (1.0, 0.0) and writing mode 0
                # are considered.
                if line['dir'] == (1.0, 0.0) and line['wmode'] == 0:
                    for span in line['spans']:
                        size_text_pairs.append((span['size'], span['text']))
                        text_parts.append(span['text'] + " ")
    finally:
        # Always release the document, even if extraction fails or exits.
        doc.close()

    text_content = ''.join(text_parts)

    # Largest fonts first; the sort is stable, so spans of equal size keep
    # their reading order.
    size_text_pairs.sort(key=lambda pair: pair[0], reverse=True)

    title = ''
    larger_font = 0
    title_max_lines = 5
    for t_font_size, t_text in size_text_pairs:
        if len(t_text) <= 2:
            # Very short spans are skipped.
            continue
        if t_font_size > larger_font:
            larger_font = t_font_size
            title = t_text.strip() + ' '
            title_max_lines -= 1
        elif t_font_size == larger_font:
            title = title + t_text.strip() + ' '
            title_max_lines -= 1
        else:
            # Sizes are descending, so nothing further can match the largest.
            break
        if title_max_lines < 1:
            break

    parsed_found_title = parse_title(title)
    best_title = get_best_title_from_llm(text_content)
    return meta_title, parsed_found_title, best_title

def do_rename(fullpath_current_filename, fullpath_new_filename):
    """Rename a file, refusing to overwrite a different existing file.

    Args:
        fullpath_current_filename: Current full path of the file.
        fullpath_new_filename: Desired full path of the file.

    Returns:
        ``True`` if the file was renamed, ``False`` if the names are
        identical, the destination is already taken by another file, or the
        operating system rejected the rename.
    """
    if fullpath_current_filename == fullpath_new_filename:
        logger.warning('Current filename and found title are already the same. Skipping...')
        return False
    try:
        # os.rename silently replaces an existing destination on POSIX, which
        # would destroy an unrelated file that happens to share the title.
        # samefile() still allows case-only renames on case-insensitive
        # file systems.
        if os.path.exists(fullpath_new_filename) and not os.path.samefile(
                fullpath_current_filename, fullpath_new_filename):
            logger.error("Destination already exists: " + fullpath_new_filename + ". File not renamed!")
            return False
        os.rename(fullpath_current_filename, fullpath_new_filename)
    except OSError as e:
        # Catch only file-system errors; a bare except would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        logger.error("An exception occurred. File not renamed!")
        logger.error(str(e))
        return False
    return True

def confirm_to_continue():
    """Ask the user whether to process, skip, or abort at the current file.

    The prompt is repeated until one of ``c``, ``s`` or ``a`` is entered.

    Returns:
        ``True`` for ``c`` (continue), ``False`` for ``s`` (skip).

    Raises:
        SystemExit: If the user chooses ``a`` (abort).
    """
    # Use the same prompt for retries so the skip option is always shown;
    # the previous retry text omitted the valid "[s]" choice.
    prompt = 'Choose [c] to continue, [s] to skip, or [a] to abort : \n'
    choice = input(prompt)
    while choice not in ('c', 's', 'a'):
        choice = input(prompt)
    if choice == 'a':
        logger.info("aborting...")
        sys.exit()
    return choice == 'c'

def select_loop_type():
    """Ask the user how the files of a directory should be processed.

    The prompt is repeated until ``1``, ``2`` or ``q`` is entered.

    Returns:
        ``'1'`` to rename every PDF without asking, or ``'2'`` to confirm
        each file individually.

    Raises:
        SystemExit: If the user enters ``q``.
    """
    logger.info('Choose [1] : for renaming all pdf files in the directory')
    logger.info('Choose [2] : for one-by-one pdf file confirmation')

    prompt = 'Choose [1], [2] or q [quit] : \n'
    answer = input(prompt)
    while answer not in ('1', '2', 'q'):
        answer = input(prompt)

    if answer == 'q':
        logger.info("aborting...")
        sys.exit()
    # Only '1' or '2' can reach this point.
    return answer

def move_file(fullpath_src_file, destination_dir, dest_file):
    """Move a file into ``destination_dir`` under the name ``dest_file``.

    The destination directory (including missing parents) is created when
    needed. Failures are logged rather than raised, and an existing file at
    the destination is never overwritten.

    Args:
        fullpath_src_file: Full path of the file to move.
        destination_dir: Directory that should receive the file.
        dest_file: File name to use inside ``destination_dir``.
    """
    destination_path = os.path.join(destination_dir, dest_file)
    try:
        # Directory creation is inside the try so that a permission error or
        # a plain file sitting at destination_dir is reported, not raised.
        os.makedirs(destination_dir, exist_ok=True)
        if os.path.exists(destination_path):
            # shutil.move would replace the existing file on POSIX.
            logger.warning('File ' + fullpath_src_file + ' was not moved: ' + destination_path + ' already exists')
            return
        shutil.move(fullpath_src_file, destination_path)
    except OSError as e:
        logger.exception(e.strerror or str(e))
        # destination_dir is already the complete target directory; the old
        # message appended '/auto_renamed_pdf' a second time.
        logger.warning('File ' + fullpath_src_file + ' was not moved to ' + destination_dir)

def _llm_title_to_filename(raw_title, max_length=125):
    """Turn free-form LLM output into a single-component ``.pdf`` file name."""
    # Path separators, characters reserved on common file systems and control
    # characters (including newlines) are replaced by a space.
    cleaned = re.sub(r'[/\\?*:<>|"\x00-\x1f]+', ' ', raw_title)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    if cleaned.lower().endswith('.pdf'):
        cleaned = cleaned[:-4]
    # Leading/trailing dots and spaces are dropped so the name can never be
    # '.', '..' or a hidden file.
    cleaned = cleaned.strip(' .')
    if not cleaned:
        return ''
    return cleaned[:max_length].rstrip(' .') + '.pdf'


def search_candidate_title(src_dir, current_file):
    """Pick the best available new file name for a PDF.

    Candidates are tried in order of preference: the LLM-proposed title, the
    title derived from the largest font on the first page, and finally the
    title stored in the document metadata.

    Args:
        src_dir: Directory containing the PDF.
        current_file: File name of the PDF inside ``src_dir``.

    Returns:
        The chosen file name, or ``None`` when every candidate is empty.
    """
    meta_title, font_based_title, best_title = scan_title(os.path.join(src_dir, current_file))

    # The LLM reply is untrusted free text: without sanitising it could
    # contain path separators or lack the '.pdf' extension, so the renamed
    # file would no longer be recognised as a PDF on later runs.
    llm_filename = _llm_title_to_filename(best_title)
    if llm_filename:
        return llm_filename
    if len(font_based_title) > 0:
        return font_based_title
    if len(meta_title) > 0:
        return meta_title
    logger.info('No potential Title was found for : ' + current_file)
    return None

def rename_files_in_dir(base_dir):
    """Rename every PDF found directly inside ``base_dir``.

    Each PDF is fingerprinted by content hash. The first file with a given
    fingerprint is renamed to its detected title and moved into the
    ``auto_renamed_pdf`` sub-directory; later files with the same fingerprint
    are kept in place and prefixed with ``duplicated_``. In one-by-one mode
    the user is asked before each file is processed.

    Args:
        base_dir: Directory to scan (sub-directories are not traversed).

    Returns:
        A ``(renamed_counter, total_counter)`` tuple: the number of files
        renamed and the number of PDF files visited.
    """
    renamed_counter = 0
    total_counter = 0
    # A set gives constant-time duplicate checks.
    seen_fingerprints = set()
    full_path_base_dir = os.path.abspath(base_dir)

    if not os.path.isdir(full_path_base_dir):
        logger.error('Directory does not exist!')
        return renamed_counter, total_counter

    # Only regular files are kept: hash_file() terminates the program when
    # handed a directory, so a folder named "something.pdf" would otherwise
    # abort the whole run.
    list_of_files = [
        name for name in os.listdir(full_path_base_dir)
        if name.endswith('.pdf') and os.path.isfile(os.path.join(full_path_base_dir, name))
    ]
    if not list_of_files:
        logger.info('no pdf files found in the target directory')
        return renamed_counter, total_counter
    logger.info(str(len(list_of_files)) + ' files found in the target directory!')

    loop_type = select_loop_type()
    renamed_dir = os.path.join(full_path_base_dir, 'auto_renamed_pdf')
    for current_file in list_of_files:
        total_counter += 1
        current_path = os.path.join(full_path_base_dir, current_file)
        fingerprint = hash_file(current_path)
        logger.info('*' * 80)
        logger.info('[Current file name] : ' + current_file)
        logger.debug('[Current file hash] : ' + fingerprint)
        logger.debug('[loop_type] : ' + loop_type)

        if loop_type == '2' and confirm_to_continue() is False:
            continue

        if fingerprint in seen_fingerprints:
            logger.warning('Another file with the same content (hash) was found in the source directory!')
            logger.info('Skipping file: ' + current_file + ' adding prefix `duplicated_`to it')
            # Go through do_rename so a failing rename is logged instead of
            # aborting the remaining files.
            do_rename(current_path, os.path.join(full_path_base_dir, 'duplicated_' + current_file))
            continue

        seen_fingerprints.add(fingerprint)
        found_title = search_candidate_title(full_path_base_dir, current_file)
        if found_title is None:
            continue
        new_path = os.path.join(full_path_base_dir, found_title)
        if do_rename(current_path, new_path):
            move_file(new_path, renamed_dir, found_title)
            renamed_counter += 1

    return renamed_counter, total_counter

def rename_target_file(src_dir, filename):
    """Rename a single PDF to its detected title and move it.

    On success the file ends up in the ``auto_renamed_pdf`` sub-directory of
    ``src_dir``.

    Args:
        src_dir: Directory containing the file.
        filename: Name of the PDF inside ``src_dir``.

    Returns:
        ``True`` only if the file was actually renamed; ``False`` if it does
        not exist, is not a ``.pdf``, no title was found, or the rename
        failed.
    """
    fullpath_filename = os.path.join(src_dir, filename)
    if not os.path.isfile(fullpath_filename):
        logger.error("File does not exist!")
        return False
    if not os.path.abspath(fullpath_filename).endswith('.pdf'):
        logger.debug("File is not a pdf!")
        return False

    found_title = search_candidate_title(src_dir, filename)
    if found_title is None:
        # Previously this path fell off the end and returned None.
        return False

    new_path = os.path.join(src_dir, found_title)
    renamed = do_rename(fullpath_filename, new_path)
    if renamed:
        move_file(new_path, os.path.join(src_dir, 'auto_renamed_pdf'), found_title)
    # Report the real outcome: the old code returned True even when
    # do_rename() failed, inflating the caller's "renamed" counter.
    return renamed

def main():
    """Script entry point: validate arguments, then rename a directory or a file.

    Prints the header, resolves the target from ``sys.argv``, installs the
    Ctrl+C handler, and dispatches to directory or single-file renaming
    depending on whether the resolved target path is a directory. A summary
    line is logged at the end.
    """
    print_header()
    base_dir, filename = validate_arguments(sys.argv)
    signal.signal(signal.SIGINT, keyboardInterruptHandler)

    # For a directory target ``filename`` is '', so the path ends in '/'.
    target_path = '{}/{}'.format(base_dir, filename)
    logger.info('[Target Path]: ' + target_path)
    separator = '*' * 80

    if os.path.isdir(target_path):
        renamed_total, scanned_total = rename_files_in_dir(base_dir)
        logger.info(separator)
        logger.info('Finished => Total files: ' + str(scanned_total) + ' Renamed files: ' + str(renamed_total))
        return

    renamed_total = 1 if rename_target_file(base_dir, filename) else 0
    logger.info(separator)
    logger.info('Finished => Renamed files : ' + str(renamed_total))

# Run the renamer only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
URL: https://ib.bsb.br/llm-renamer