import os
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed
import tiktoken
import logging
from tqdm import tqdm
import time
import random
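
# Assumed dependencies (not pinned anywhere in this script): the code targets
# the openai>=1.0 client interface (openai.OpenAI, client.chat.completions.create)
# plus tiktoken and tqdm, e.g.:
#   pip install "openai>=1.0" tiktoken tqdm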

# Set up logging
logging.basicConfig(level=logging.INFO)

# Initialize the OpenAI client with the API key
api_key = os.getenv('OPENAI_KEY')
if not api_key:
    raise ValueError("API key not found. Please set the OPENAI_KEY environment variable.")
client = openai.OpenAI(api_key=api_key)

def load_text(file_path):
    """Load text from a specified file."""
    try:
        with open(file_path, 'r') as file:
            return file.read()
    except Exception as e:
        logging.error(f'Failed to load file {file_path}: {str(e)}')
        raise

def initialize_files(output_file, log_file):
    """Initialize the output and log files by creating empty files."""
    try:
        open(output_file, 'w').close()
        open(log_file, 'w').close()
    except Exception as e:
        logging.error(f'Failed to initialize files {output_file}, {log_file}: {str(e)}')
        raise

def save_to_file(responses, output_file):
    """Save API responses to an output file."""
    try:
        with open(output_file, 'w') as file:
            for response in responses:
                file.write(response + '\n')
    except Exception as e:
        logging.error(f'Failed to save to file {output_file}: {str(e)}')
        raise

def log_to_file(log_file, message):
    """Log messages to a log file."""
    try:
        with open(log_file, 'a') as file:
            file.write(message + '\n')
    except Exception as e:
        logging.error(f'Failed to log to file {log_file}: {str(e)}')
        raise

def call_openai_api(chunk, model, max_tokens, temperature, prompt):
    """Call the OpenAI API, retrying with exponential backoff on rate limits."""
    for i in range(3):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": chunk},
                ],
                max_tokens=max_tokens,
                n=1,
                temperature=temperature,
            )
            return response.choices[0].message.content.strip()
        except openai.RateLimitError:
            # Catch the dedicated exception type rather than string-matching the message
            wait_time = (2 ** i) + random.random()  # Exponential backoff with jitter
            logging.warning(f'Rate limit exceeded. Retrying after {wait_time:.1f} seconds.')
            time.sleep(wait_time)
        except openai.OpenAIError as e:
            logging.error(f'API call failed: {str(e)}')
            return None
    logging.error('Failed to call OpenAI API after multiple retries due to rate limiting.')
    return None

def split_into_chunks(text, model, tokens=3500):
    """Split the text into chunks of at most `tokens` tokens each."""
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(text)  # Token IDs, not words
    chunks = []
    for i in range(0, len(token_ids), tokens):
        # decode() already returns a string, so no join is needed
        chunks.append(encoding.decode(token_ids[i:i + tokens]))
    return chunks

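# For example, with tokens=3500 a ~10,000-token input yields three chunks of
# roughly 3500, 3500, and 3000 tokens. Decoding token slices independently can
# split a word (or a multi-byte character) at a chunk boundary; that is
# tolerable here because each chunk is processed on its own.
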
def process_chunks(input_file, output_file, log_file, model, chunksize, max_tokens, temperature, prompt):
    """Split the input text into chunks and send each one to the OpenAI API."""
    initialize_files(output_file, log_file)
    text = load_text(input_file)
    chunks = split_into_chunks(text, model, tokens=chunksize)
    num_chunks = len(chunks)
    print(f'{num_chunks} chunks.')
    log_to_file(log_file, f'Number of chunks: {num_chunks}')
    # Map each future to its chunk index so responses are saved in input order,
    # even though as_completed() yields futures in completion order.
    responses = [None] * num_chunks
    with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(call_openai_api, chunk, model, max_tokens, temperature, prompt): i
            for i, chunk in enumerate(chunks)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc='Processing chunks'):
            index = futures[future]
            response = future.result()
            if response is None:
                log_to_file(log_file, f'Failed to process chunk {index}')
            else:
                responses[index] = response
                log_to_file(log_file, f'Successfully processed chunk {index}')
    save_to_file([r for r in responses if r is not None], output_file)

if __name__ == "__main__":
    input_file = 'input.txt'
    output_file = 'output.txt'
    log_file = 'log.txt'
    model = 'gpt-3.5-turbo'
    chunksize = 3500
    # Note: chunk tokens + prompt tokens + max_tokens must fit the model's
    # context window; current gpt-3.5-turbo versions have a 16k context and a
    # 4,096-token completion cap, so these values sit close to the limit.
    max_tokens = 4000
    temperature = 0.01
    prompt = '''You will be presented with a scrambled, poorly formatted BibTeX entry.
Your task is to refactor the entry to fix all syntax problems and ensure it adheres to the standard BibTeX format.
Here is an example of a properly formatted BibTeX entry:
@article{Smith2023, author = {Smith, John and Doe, Jane}, title = {The Impact of Artificial Intelligence on Society}, journal = {Journal of Artificial Intelligence Research}, year = {2023}, volume = {10}, number = {2}, pages = {123--145}, doi = {10.1234/jair.2023.10.2.123}, abstract = {This paper explores the profound impact of artificial intelligence (AI) on various aspects of society. We discuss the ethical implications, economic consequences, and potential societal benefits of AI. Our analysis highlights the need for responsible AI development and deployment to mitigate risks and maximize its positive impact.}, keywords = {Artificial Intelligence, Society, Ethics, Economics, Impact} }
Here is the scrambled BibTeX data you need to refactor: <scrambled_bibtex> </scrambled_bibtex>
To refactor the scrambled BibTeX data, follow these steps:
1. Check for missing or incorrect delimiters, such as curly braces, commas, and equals signs. Ensure that each field is properly enclosed in curly braces and that key-value pairs are separated by commas.
2. Ensure proper indentation and line breaks. Each field should start on a new line and be indented with two spaces.
3. Verify the presence and correct order of required fields, such as author, title, and year. Optional fields can be included as needed.
4. Check for proper capitalization and punctuation within fields. Titles should be capitalized appropriately, and punctuation should be consistent.
5. If any information is missing or unclear, note this in your response.
Please provide the refactored BibTeX entry inside <refactored_bibtex> tags.'''
    process_chunks(input_file, output_file, log_file, model, chunksize, max_tokens, temperature, prompt)

# Reference: github.com/andrewgcodes/lightspeedGPT
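
# Usage sketch (assumes input.txt exists in the working directory; the script
# name below stands in for whatever this file is saved as):
#   export OPENAI_KEY=sk-...
#   python this_script.py
# Processed chunks are written to output.txt; per-chunk status goes to log.txt.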