Convert everything to readable PDFs

  #!/bin/bash
# enhanced_batch_convert_to_pdf.sh
# Converts diverse files to PDFs with readable text, metadata, or hex dumps.

# --- Configuration ---
# Path prefix removed from every input path before the remainder is mirrored
# under the output directory. Example: with the prefix /mnt/data/project1 the
# input /mnt/data/project1/docs/file.txt ends up as converted_pdfs/docs/file.pdf.
# When this is empty, the full input path (without its leading /) is mirrored.
COMMON_PREFIX_TO_STRIP='/mnt/mSATA/linaro/Desktop/00-TEMP/TCC/unique'

# Placeholder style for binary or otherwise unconvertible files:
#   metadata - PDF listing file info (name, type, size)
#   hex      - PDF containing a hex dump of the file
#   strings  - PDF containing the printable strings found in the file
DEFAULT_BINARY_HANDLING='strings'

# true:  OCR every existing PDF, even those that appear to carry a text layer.
# false: OCR a PDF only when no text layer is detected.
FORCE_OCR_ALL_EXISTING_PDFS=false

# Whether OCR is attempted when an image is turned into a PDF.
OCR_IMAGES_TO_PDF=true

# Root directory that receives all generated PDFs (in subdirectories).
OUTPUT_DIR_BASE='converted_pdfs'
# File that collects the timestamped conversion log.
LOG_FILE='conversion_log_enhanced.txt'

# --- Helper Functions ---

# Append one timestamped line to the log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1 - message text
log_msg() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# Verify that every external tool the script invokes is available.
# Each missing tool is reported in the log and on stderr; if at least one is
# missing the whole script exits with status 1.
# Globals:   none read directly (logging goes through log_msg)
# Arguments: none
# Returns:   0 when all tools are present; does not return otherwise (exit 1)
check_commands() {
    local cmd # keep the loop variable out of the global scope
    local missing_cmds=0
    # "strings" is listed because the default binary handling depends on it.
    local cmds_to_check=(
        "file" "libreoffice" "pandoc" "pdflatex" "convert" "jq" "enscript"
        "ps2pdf" "pdffonts" "ocrmypdf" "xxd" "strings" "man" "realpath" "mktemp" "dirname" "basename"
    )
    log_msg "INFO: Checking for required commands..."
    for cmd in "${cmds_to_check[@]}"; do
        if ! command -v "$cmd" &> /dev/null; then
            log_msg "ERROR: Required command '$cmd' not found. Please install it."
            echo "ERROR: Required command '$cmd' not found. Please install it." >&2
            missing_cmds=$((missing_cmds + 1))
        fi
    done
    if [[ $missing_cmds -gt 0 ]]; then
        log_msg "FATAL: $missing_cmds required command(s) are missing. Exiting."
        echo "FATAL: $missing_cmds required command(s) are missing. Exiting." >&2
        exit 1
    fi
    log_msg "INFO: All required commands found."
}

# Decide whether a PDF appears to contain embedded text, judged by whether
# pdffonts lists any fonts for it.
# Arguments: $1 - path to the PDF
# Returns:   0 if at least one font row is counted, 1 if none or if pdffonts fails
has_text_layer() {
    local font_listing font_rows
    # Run pdffonts once and reuse its output; a failure (unreadable or corrupt
    # PDF) is reported as "no text layer".
    font_listing=$(pdffonts "$1" 2>/dev/null) || return 1
    # The first two lines are the column header and its underline. Rows whose
    # last field is not "no" are counted.
    # NOTE(review): the last pdffonts column looks like the object ID, so this
    # presumably counts every font row - confirm against pdffonts output.
    font_rows=$(awk 'NR>2 {if ($NF != "no") c++} END{print c+0}' <<< "$font_listing")
    [[ "$font_rows" -gt 0 ]]
}

# Print the lowercased extension of a path (the text after the last dot of
# its final component), or an empty line when the name contains no dot.
# Arguments: $1 - file name or path
# Outputs:   the extension without the dot, followed by a newline
normalize_ext() {
    local filename ext
    # "--" keeps names that start with "-" from being parsed as options.
    filename=$(basename -- "$1")
    ext="${filename##*.}"
    if [[ "$ext" == "$filename" ]]; then # No extension
        printf '\n'
    else
        # printf instead of echo: an extension such as "-n" must be printed,
        # not interpreted as an echo option.
        printf '%s\n' "$ext" | tr '[:upper:]' '[:lower:]'
    fi
}

# Create a placeholder PDF that describes a file instead of rendering it.
# The text is typeset by enscript and converted to PDF by ps2pdf.
# Globals:   LOG_FILE (read) - receives ps2pdf output
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - file(1) description (printed verbatim)
#            $5 - reason the placeholder is being created
# Returns:   0 if every pipeline stage succeeded and the PDF is non-empty, 1 otherwise
create_metadata_pdf() {
    local infile="$1"
    local outfile="$2"
    local detected_mimetype="$3"
    local file_description="$4"
    local reason="$5"
    local filesize stage_rc
    local pipeline_ok=true
    filesize=$(du -b "$infile" | cut -f1) # Size in bytes

    log_msg "INFO: Creating metadata PDF for '$infile'. Reason: $reason"
    (
        echo "File Information"
        echo "----------------"
        echo "Original Filename: $(basename "$infile")"
        echo "Full Path: $infile"
        echo "Detected MIME Type: $detected_mimetype"
        echo "File Command Description: $file_description"
        echo "Size: $filesize bytes ($(du -h "$infile" | cut -f1))"
        echo "Modification Date: $(date -r "$infile")"
        echo ""
        echo "Reason for this Metadata PDF:"
        printf '%s\n' "$reason"
        echo "The content of the original file could not be meaningfully rendered as a standard document."
    ) | enscript -B --font=Courier10 --word-wrap --margins=50:50:50:50 -p - -o - | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1
    # $? would only report ps2pdf; inspect every stage so that an enscript
    # failure is not logged as success.
    for stage_rc in "${PIPESTATUS[@]}"; do
        [[ "$stage_rc" -eq 0 ]] || pipeline_ok=false
    done
    if $pipeline_ok && [[ -s "$outfile" ]]; then
        log_msg "OK: Metadata PDF created for '$infile' at '$outfile'."
        return 0
    fi
    log_msg "ERROR: Metadata PDF creation FAILED for '$infile'."
    return 1
}

# Create a PDF holding file information followed by a hex dump of the file.
# The dump is limited to the first 1 MiB (xxd -l 1048576) and typeset in
# landscape by enscript, then converted to PDF by ps2pdf.
# Globals:   LOG_FILE (read) - receives ps2pdf output
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - file(1) description (printed verbatim)
# Returns:   0 if every pipeline stage succeeded and the PDF is non-empty, 1 otherwise
create_hexdump_pdf() {
    local infile="$1"
    local outfile="$2"
    local detected_mimetype="$3"
    local file_description="$4"
    local filesize stage_rc
    local pipeline_ok=true
    filesize=$(du -b "$infile" | cut -f1)

    log_msg "INFO: Creating hex dump PDF for '$infile'."
    (
        echo "File Information & Hex Dump"
        echo "---------------------------"
        echo "Original Filename: $(basename "$infile")"
        echo "Full Path: $infile"
        echo "Detected MIME Type: $detected_mimetype"
        echo "File Command Description: $file_description"
        echo "Size: $filesize bytes ($(du -h "$infile" | cut -f1))"
        echo "Modification Date: $(date -r "$infile")"
        echo ""
        echo "Hex Dump (first 1MB or full file if smaller):"
        xxd -l 1048576 "$infile" # Limit to 1MB to avoid huge PDFs
    ) | enscript -B --font=Courier8 --word-wrap --margins=50:50:50:50 -r -p - -o - | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1 # -r for landscape
    # $? would only report ps2pdf; inspect every stage so that a failing xxd
    # or enscript is not logged as success.
    for stage_rc in "${PIPESTATUS[@]}"; do
        [[ "$stage_rc" -eq 0 ]] || pipeline_ok=false
    done
    if $pipeline_ok && [[ -s "$outfile" ]]; then
        log_msg "OK: Hex dump PDF created for '$infile' at '$outfile'."
        return 0
    fi
    log_msg "ERROR: Hex dump PDF creation FAILED for '$infile'."
    return 1
}

# Create a PDF holding file information followed by the printable strings
# extracted from the file (strings: minimum length 4, whole file, decimal
# offsets). The text is typeset by enscript and converted to PDF by ps2pdf.
# Globals:   LOG_FILE (read) - receives ps2pdf output
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - file(1) description (printed verbatim)
# Returns:   0 if every pipeline stage succeeded and the PDF is non-empty, 1 otherwise
create_strings_pdf() {
    local infile="$1"
    local outfile="$2"
    local detected_mimetype="$3"
    local file_description="$4"
    local filesize stage_rc
    local pipeline_ok=true
    filesize=$(du -b "$infile" | cut -f1)

    log_msg "INFO: Creating extracted strings PDF for '$infile'."
    (
        echo "File Information & Extracted Strings"
        echo "------------------------------------"
        echo "Original Filename: $(basename "$infile")"
        echo "Full Path: $infile"
        echo "Detected MIME Type: $detected_mimetype"
        echo "File Command Description: $file_description"
        echo "Size: $filesize bytes ($(du -h "$infile" | cut -f1))"
        echo "Modification Date: $(date -r "$infile")"
        echo ""
        echo "Extracted Printable Strings (UTF-8, min length 4):"
        strings -n 4 -a -t d --encoding=S "$infile" # Show offset, include all file
    ) | enscript -B --font=Courier10 --word-wrap --margins=50:50:50:50 -p - -o - | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1
    # $? would only report ps2pdf; inspect every stage so that a failing
    # strings or enscript is not logged as success.
    for stage_rc in "${PIPESTATUS[@]}"; do
        [[ "$stage_rc" -eq 0 ]] || pipeline_ok=false
    done
    if $pipeline_ok && [[ -s "$outfile" ]]; then
        log_msg "OK: Strings PDF created for '$infile' at '$outfile'."
        return 0
    fi
    log_msg "ERROR: Strings PDF creation FAILED for '$infile'."
    return 1
}


# --- Main Conversion Function ---

# Private helper for convert_file: write the placeholder PDF selected by the
# binary-handling method.
# Arguments: $1 - input file, $2 - output PDF,
#            $3 - method: "hex", "strings", anything else means metadata,
#            $4 - MIME type, $5 - file(1) description,
#            $6 - reason text (used by the metadata placeholder only)
_convert_binary_placeholder() {
    case "$3" in
        hex)     create_hexdump_pdf "$1" "$2" "$4" "$5" ;;
        strings) create_strings_pdf "$1" "$2" "$4" "$5" ;;
        *)       create_metadata_pdf "$1" "$2" "$4" "$5" "$6" ;;
    esac
}

# Private helper for convert_file: typeset a text file with enscript and turn
# the PostScript into a PDF with ps2pdf.
# Arguments: $1 - input file, $2 - output PDF, $3... - extra enscript options
# Returns:   0 only if both stages succeeded and the PDF is non-empty. On
#            failure the (possibly bogus) output is removed so that later
#            fallbacks start from a clean state.
_convert_enscript_pdf() {
    local src="$1" dest="$2" stage_rc pipeline_ok=true
    shift 2
    enscript "$src" --font=Courier10 "$@" -p - -o - | ps2pdf - "$dest" >> "$LOG_FILE" 2>&1
    # $? alone would only reflect ps2pdf.
    for stage_rc in "${PIPESTATUS[@]}"; do
        [[ "$stage_rc" -eq 0 ]] || pipeline_ok=false
    done
    if $pipeline_ok && [[ -s "$dest" ]]; then
        return 0
    fi
    rm -f -- "$dest"
    return 1
}

# Private helper for convert_file: render a source file verbatim through
# Pandoc by wrapping it in a Markdown fenced code block, optionally tagged
# with a highlighting language.
# Arguments: $1 - input file, $2 - output PDF, $3 - language class (may be empty)
# Returns:   Pandoc's exit status
# Note:      a source line made of 16 or more tildes would close the block early.
_convert_pandoc_codeblock() {
    local src="$1" dest="$2" lang="$3"
    local fence='~~~~~~~~~~~~~~~~'
    {
        if [[ -n "$lang" ]]; then
            printf '%s {.%s}\n' "$fence" "$lang"
        else
            printf '%s\n' "$fence"
        fi
        cat -- "$src"
        printf '\n%s\n' "$fence"
    } | pandoc --from=markdown --standalone --highlight-style=kate -o "$dest" >> "$LOG_FILE" 2>&1
}

# Convert one file to a PDF below OUTPUT_DIR_BASE, choosing the tool from the
# MIME type reported by file(1).
# Globals:   COMMON_PREFIX_TO_STRIP, OUTPUT_DIR_BASE, FORCE_OCR_ALL_EXISTING_PDFS,
#            OCR_IMAGES_TO_PDF, LOG_FILE (all read)
# Arguments: $1 - input file
#            $2 - binary handling method: metadata | hex | strings
# Returns:   0 if the output PDF exists afterwards (including the case where an
#            up-to-date output made the conversion unnecessary), 1 otherwise
convert_file() {
    local infile="$1"
    local binary_handling_method="$2"
    local conversion_done=false

    # --- Determine the output path ---
    # The prefix is compared with a trailing slash so only whole path
    # components are stripped; otherwise just the leading "/" is dropped.
    local strip_prefix="$COMMON_PREFIX_TO_STRIP"
    if [[ -n "$strip_prefix" && "${strip_prefix: -1}" != "/" ]]; then
        strip_prefix="$strip_prefix/"
    fi
    local relative_path_to_input
    if [[ -n "$strip_prefix" && "$infile" == "$strip_prefix"* ]]; then
        relative_path_to_input="${infile#"$strip_prefix"}"
    else
        relative_path_to_input="${infile#/}"
    fi

    local out_subdir
    out_subdir=$(dirname "$OUTPUT_DIR_BASE/$relative_path_to_input")
    if ! mkdir -p "$out_subdir"; then
        log_msg "ERROR: Could not create output directory '$out_subdir' for '$infile'."
        return 1
    fi

    local in_filename in_base
    in_filename=$(basename "$infile")
    in_base="${in_filename%.*}"
    # A dotfile such as ".bashrc" would otherwise yield an empty base name and
    # every dotfile in a directory would collide on ".pdf".
    [[ -n "$in_base" ]] || in_base="$in_filename"
    local outfile="$out_subdir/$in_base.pdf"

    local ext mimetype filedesc
    ext=$(normalize_ext "$in_filename")
    mimetype=$(file -b --mime-type "$infile" | cut -d';' -f1) # Remove charset
    filedesc=$(file -b "$infile")

    log_msg "-----------------------------------------------------"
    log_msg "START Processing: '$infile'"
    log_msg "INFO: MIME='$mimetype', Ext='$ext', Desc='$filedesc', OutFile='$outfile'"

    if [[ -f "$outfile" && "$outfile" -nt "$infile" ]]; then
        log_msg "SKIP: Output '$outfile' exists and is newer than '$infile'."
        return 0 # Indicate skipped
    fi

    case "$mimetype" in
        application/pdf)
            # The tool's exit status is recorded explicitly; testing $? after
            # the if/elif/else would pick up the status of whatever ran last.
            local pdf_status=0
            if $FORCE_OCR_ALL_EXISTING_PDFS; then
                log_msg "INFO: PDF '$infile' - OCR FORCED."
                ocrmypdf --force-ocr "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$?
            elif ! has_text_layer "$infile"; then
                log_msg "INFO: PDF '$infile' needs OCR (no text layer detected)."
                ocrmypdf "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$? # Default: adds layer if missing
            else
                log_msg "INFO: PDF '$infile' has text layer. Copying."
                cp "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$?
                if [[ $pdf_status -ne 0 ]]; then
                    log_msg "ERROR: Failed to copy PDF '$infile'."
                fi
            fi
            if [[ $pdf_status -eq 0 && -f "$outfile" ]]; then
                log_msg "OK: PDF '$infile' processed to '$outfile'."
                conversion_done=true
            else
                log_msg "ERROR: Processing PDF '$infile' failed (exit status $pdf_status)."
            fi
            ;;

        application/msword|application/vnd.ms-word*|\
        application/vnd.openxmlformats-officedocument.wordprocessingml.document|\
        application/vnd.oasis.opendocument.text*|application/rtf|\
        application/vnd.ms-excel*|application/vnd.openxmlformats-officedocument.spreadsheetml.sheet|\
        application/vnd.oasis.opendocument.spreadsheet*)
            log_msg "INFO: Office document '$infile'. Converting with LibreOffice."
            # LibreOffice writes <input base name>.pdf into --outdir, which is
            # the same path as $outfile.
            libreoffice --headless --convert-to pdf "$infile" --outdir "$out_subdir" >> "$LOG_FILE" 2>&1
            if [[ -f "$outfile" ]]; then
                log_msg "OK: '$infile' converted via LibreOffice to '$outfile'."
                conversion_done=true
            else
                log_msg "ERROR: LibreOffice conversion FAILED for '$infile'. Output '$outfile' not found."
            fi
            ;;

        text/csv|text/tab-separated-values)
            log_msg "INFO: CSV/TSV '$infile'. Converting with Pandoc."
            # NOTE(review): TSV input is also read with the csv reader - confirm
            # that Pandoc handles tab-delimited data as intended here.
            if pandoc "$infile" -o "$outfile" --from=csv --toc --standalone >> "$LOG_FILE" 2>&1; then
                log_msg "OK: '$infile' converted via Pandoc."
                conversion_done=true
            else
                log_msg "ERROR: Pandoc (CSV) FAILED for '$infile'."
            fi
            ;;

        text/markdown)
            log_msg "INFO: Markdown '$infile'. Converting with Pandoc."
            if pandoc "$infile" -o "$outfile" --standalone >> "$LOG_FILE" 2>&1; then
                log_msg "OK: '$infile' converted via Pandoc."
                conversion_done=true
            else
                log_msg "ERROR: Pandoc (Markdown) FAILED for '$infile'."
            fi
            ;;

        application/json)
            log_msg "INFO: JSON '$infile'. Attempting pretty-print with jq then Pandoc."
            # Pandoc treats a ".json" input file as its own document format, so
            # the JSON text is fed to it as a fenced code block instead.
            local tmp_json_pretty json_source="$infile" json_label="raw JSON"
            tmp_json_pretty=$(mktemp "$out_subdir/json_pretty_XXXXXX")
            if [[ -n "$tmp_json_pretty" ]] && jq . "$infile" > "$tmp_json_pretty" 2>/dev/null; then
                json_source="$tmp_json_pretty"
                json_label="pretty JSON"
            else
                log_msg "WARN: jq failed for '$infile'. Trying Pandoc on raw, then enscript."
            fi
            if _convert_pandoc_codeblock "$json_source" "$outfile" "json"; then
                log_msg "OK: '$infile' ($json_label) converted via Pandoc."
                conversion_done=true
            else
                log_msg "ERROR: Pandoc ($json_label) FAILED for '$infile'. Trying enscript."
                if _convert_enscript_pdf "$infile" "$outfile"; then
                    log_msg "OK: '$infile' (JSON) converted via enscript fallback."
                    conversion_done=true
                else
                    log_msg "ERROR: enscript fallback for JSON '$infile' FAILED."
                fi
            fi
            if [[ -n "$tmp_json_pretty" ]]; then
                rm -f "$tmp_json_pretty"
            fi
            ;;

        image/png|image/jpeg|image/gif|image/bmp|image/tiff|image/webp)
            log_msg "INFO: Image '$infile'."
            if $OCR_IMAGES_TO_PDF && command -v ocrmypdf &> /dev/null; then
                log_msg "Attempting OCR with ocrmypdf for image '$infile'."
                if ocrmypdf "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
                    log_msg "OK: Image '$infile' OCR'd via ocrmypdf to '$outfile'."
                    conversion_done=true
                else
                    log_msg "ERROR: ocrmypdf FAILED for image '$infile'. Falling back to ImageMagick's convert."
                    if convert "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
                        log_msg "OK: Image '$infile' converted via ImageMagick (no OCR)."
                        conversion_done=true
                    else
                        log_msg "ERROR: ImageMagick's convert also FAILED for '$infile'."
                    fi
                fi
            else
                log_msg "Converting image '$infile' with ImageMagick's convert (no OCR)."
                if convert "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
                    log_msg "OK: Image '$infile' converted via ImageMagick."
                    conversion_done=true
                else
                    log_msg "ERROR: ImageMagick's convert FAILED for '$infile'."
                fi
            fi
            ;;

        application/x-tex|text/x-tex|application/x-latex)
            if [[ "$ext" == "cls" || "$ext" == "sty" ]]; then # LaTeX class/style files
                log_msg "INFO: LaTeX Class/Style file '$infile'. Converting as syntax-highlighted text."
                if _convert_pandoc_codeblock "$infile" "$outfile" "latex"; then
                    log_msg "OK: '$infile' converted via Pandoc (as LaTeX source)."
                    conversion_done=true
                else
                    log_msg "ERROR: Pandoc (LaTeX source) FAILED for '$infile'. Trying enscript."
                    # "tex" is the name used for enscript highlighting here; a
                    # plain run follows if the highlighted one fails.
                    if _convert_enscript_pdf "$infile" "$outfile" --highlight=tex \
                        || _convert_enscript_pdf "$infile" "$outfile"; then
                        log_msg "OK: '$infile' converted via enscript (highlighted LaTeX)."
                        conversion_done=true
                    else
                        log_msg "ERROR: enscript (highlighted LaTeX) FAILED for '$infile'."
                    fi
                fi
            else # Regular .tex files
                log_msg "INFO: LaTeX document '$infile'. Compiling with pdflatex."
                local temp_tex_dir
                # Without this guard a failed mktemp would leave the variable
                # empty and the copy below would target "/".
                if ! temp_tex_dir=$(mktemp -d "$out_subdir/tex_compile_XXXXXX"); then
                    log_msg "ERROR: Could not create a temporary build directory for '$infile'."
                else
                    cp "$infile" "$temp_tex_dir/" # Copy tex file to temp dir
                    # Only the .tex file itself is copied; companion .bib files or
                    # images are not, so the document must be self-contained.
                    # pdflatex runs twice so cross-references can settle.
                    (cd "$temp_tex_dir" && \
                     pdflatex -interaction=nonstopmode "$in_filename" && \
                     pdflatex -interaction=nonstopmode "$in_filename") >> "$LOG_FILE" 2>&1

                    local compiled_pdf_path="$temp_tex_dir/${in_filename%.*}.pdf"
                    if [[ -f "$compiled_pdf_path" ]]; then
                        if mv "$compiled_pdf_path" "$outfile" >> "$LOG_FILE" 2>&1; then
                            log_msg "OK: '$infile' compiled via pdflatex to '$outfile'."
                            conversion_done=true
                        else
                            log_msg "ERROR: pdflatex compiled '$infile', but FAILED to move to '$outfile'."
                        fi
                    else
                        log_msg "ERROR: pdflatex compilation FAILED for '$infile'. Output PDF not found in '$temp_tex_dir'."
                    fi
                    rm -rf "$temp_tex_dir"
                fi
            fi
            ;;

        application/x-bibtex|text/x-bibtex)
            log_msg "INFO: BibTeX file '$infile'. Converting with Pandoc."
            if pandoc "$infile" --standalone -o "$outfile" >> "$LOG_FILE" 2>&1; then
                log_msg "OK: '$infile' converted via Pandoc."
                conversion_done=true
            else
                log_msg "ERROR: Pandoc (BibTeX) FAILED for '$infile'. Trying enscript."
                if _convert_enscript_pdf "$infile" "$outfile"; then
                    log_msg "OK: '$infile' (BibTeX) converted as plain text via enscript."
                    conversion_done=true
                else
                    log_msg "ERROR: enscript fallback for BibTeX '$infile' FAILED."
                fi
            fi
            ;;

        application/x-troff-man|text/troff) # Man pages
            log_msg "INFO: Man page '$infile'. Converting with 'man'."
            if man -Tpdf "$infile" > "$outfile" 2>> "$LOG_FILE" && [[ -s "$outfile" ]]; then
                log_msg "OK: '$infile' converted via 'man -Tpdf'."
                conversion_done=true
            else
                log_msg "WARN: 'man -Tpdf' FAILED for '$infile'. Trying Pandoc."
                if pandoc "$infile" --standalone -f man -t pdf -o "$outfile" >> "$LOG_FILE" 2>&1; then
                    log_msg "OK: '$infile' converted via Pandoc (man)."
                    conversion_done=true
                else
                    log_msg "ERROR: Pandoc (man) FAILED for '$infile'. Trying enscript."
                    if _convert_enscript_pdf "$infile" "$outfile"; then
                        log_msg "OK: '$infile' (man page) converted as text via enscript."
                        conversion_done=true
                    else
                        log_msg "ERROR: enscript fallback for man page '$infile' FAILED."
                    fi
                fi
            fi
            ;;

        text/x-python|text/x-shellscript|application/x-perl|application/x-ruby|\
        text/x-csrc|text/x-chdr|text/x-c++src|text/x-java|text/html|text/css|application/javascript|application/xml|text/xml) # Code, XML, HTML
            log_msg "INFO: Code/Markup file '$infile' ($mimetype). Converting with Pandoc (syntax highlighting)."
            # Language names are chosen per tool: one for the Pandoc code-block
            # class, one for enscript --highlight (left empty where no matching
            # enscript name is known).
            local highlight_lang="" enscript_lang=""
            case "$ext" in
                py) highlight_lang="python"; enscript_lang="python" ;;
                sh|bash) highlight_lang="bash"; enscript_lang="bash" ;;
                pl) highlight_lang="perl"; enscript_lang="perl" ;;
                rb) highlight_lang="ruby"; enscript_lang="ruby" ;;
                c|h) highlight_lang="c"; enscript_lang="c" ;;
                cpp|hpp|cxx) highlight_lang="cpp"; enscript_lang="cpp" ;;
                java) highlight_lang="java"; enscript_lang="java" ;;
                html|htm) highlight_lang="html"; enscript_lang="html" ;;
                css) highlight_lang="css" ;;
                js) highlight_lang="javascript"; enscript_lang="javascript" ;;
                xml|bcf) highlight_lang="xml" ;; # .bcf (Biber control file)
            esac

            local pandoc_ok=false
            if [[ "$highlight_lang" == "html" ]]; then
                # html is a real Pandoc reader, so HTML is rendered as a document.
                if pandoc "$infile" --from=html --standalone --highlight-style=kate -o "$outfile" >> "$LOG_FILE" 2>&1; then
                    pandoc_ok=true
                fi
            else
                # Programming languages are not Pandoc readers (--from=python is
                # rejected), so the source goes in as a highlighted code block.
                if _convert_pandoc_codeblock "$infile" "$outfile" "$highlight_lang"; then
                    pandoc_ok=true
                fi
            fi

            if $pandoc_ok && [[ -s "$outfile" ]]; then
                log_msg "OK: '$infile' converted via Pandoc with highlighting."
                conversion_done=true
            else
                log_msg "WARN: Pandoc with highlighting FAILED for '$infile'. Trying enscript."
                local -a enscript_opts=()
                if [[ -n "$enscript_lang" ]]; then
                    enscript_opts=("--highlight=$enscript_lang")
                fi
                # If the highlighted run fails, retry once as plain text.
                if _convert_enscript_pdf "$infile" "$outfile" "${enscript_opts[@]}" \
                    || { [[ ${#enscript_opts[@]} -gt 0 ]] && _convert_enscript_pdf "$infile" "$outfile"; }; then
                    log_msg "OK: '$infile' converted via enscript."
                    conversion_done=true
                else
                    log_msg "ERROR: enscript fallback for '$infile' FAILED."
                fi
            fi
            ;;

        text/*) # Generic text files (.log, .txt, .bak, .aux, .synctex, .blg, .info, .lst, .conf etc.)
            log_msg "INFO: Generic text file '$infile' ($mimetype, ext: .$ext). Converting with enscript."
            if _convert_enscript_pdf "$infile" "$outfile" --word-wrap; then
                log_msg "OK: '$infile' converted via enscript."
                conversion_done=true
            else
                log_msg "ERROR: enscript FAILED for '$infile'."
            fi
            ;;

        application/octet-stream|application/x-dosexec|application/x-sharedlib|\
        application/x-object|application/x-executable|application/x-sqlite3|inode/x-empty|\
        application/x-archive|application/zip|application/gzip|application/x-bzip2|application/x-xz)
            log_msg "INFO: Binary/Archive/Empty/Unknown file '$infile' (MIME: $mimetype, Desc: $filedesc)."
            if [[ "$mimetype" == "inode/x-empty" ]]; then
                create_metadata_pdf "$infile" "$outfile" "$mimetype" "$filedesc" "File is empty."
                conversion_done=true
            # Heuristic: if 'filedesc' suggests text despite the MIME type, try enscript
            elif [[ "$filedesc" == *"text"* || "$filedesc" == *"script"* ]]; then
                log_msg "INFO: MIME is '$mimetype', but filedesc suggests text ('$filedesc'). Trying enscript."
                if _convert_enscript_pdf "$infile" "$outfile" --word-wrap; then
                    log_msg "OK: '$infile' converted via enscript (heuristic)."
                else
                    log_msg "ERROR: enscript (heuristic) FAILED for '$infile'. Using binary handling."
                    _convert_binary_placeholder "$infile" "$outfile" "$binary_handling_method" \
                        "$mimetype" "$filedesc" "Binary or undetermined content type ($mimetype)."
                fi
                conversion_done=true # Placeholder PDF is a form of "done"
            else # Standard binary handling
                _convert_binary_placeholder "$infile" "$outfile" "$binary_handling_method" \
                    "$mimetype" "$filedesc" "Binary, archive, or undetermined content type ($mimetype)."
                conversion_done=true # Placeholder PDF is a form of "done"
            fi
            ;;

        *) # Fallback for truly unrecognized/unhandled MIME types
            log_msg "WARN: Unhandled MIME type '$mimetype' for '$infile'. File description: '$filedesc'."
            if [[ "$filedesc" == *"text"* || "$filedesc" == *"script"* ]]; then
                log_msg "INFO: Unhandled MIME, but filedesc suggests text ('$filedesc'). Trying enscript."
                if _convert_enscript_pdf "$infile" "$outfile" --word-wrap; then
                    log_msg "OK: '$infile' converted via enscript (heuristic for unhandled MIME)."
                else
                    log_msg "ERROR: enscript (heuristic for unhandled MIME) FAILED for '$infile'. Using binary handling."
                    _convert_binary_placeholder "$infile" "$outfile" "$binary_handling_method" \
                        "$mimetype" "$filedesc" "Unhandled MIME type ($mimetype) and not clearly text."
                fi
                conversion_done=true
            else
                log_msg "INFO: Unhandled MIME type '$mimetype' for '$infile'. Using configured binary handling."
                _convert_binary_placeholder "$infile" "$outfile" "$binary_handling_method" \
                    "$mimetype" "$filedesc" "Unhandled MIME type ($mimetype)."
                conversion_done=true
            fi
            ;;
    esac

    if ! $conversion_done; then
        if [[ ! -f "$outfile" ]]; then
            log_msg "FALLBACK: No conversion method succeeded and no output file created for '$infile'. Creating placeholder."
            create_metadata_pdf "$infile" "$outfile" "$mimetype" "$filedesc" "No applicable conversion rule or all attempts failed."
        else
            log_msg "WARN: Conversion logic did not explicitly set 'done' flag, but output file '$outfile' exists. Assuming prior step handled it."
        fi
    fi
    log_msg "END Processing: '$infile'"
    # Return 0 if output exists, 1 otherwise
    if [[ -f "$outfile" ]]; then
        return 0
    fi
    return 1
}

# --- Script Main Logic ---

# Argument Parsing
# $1 list file (required), $2 prefix to strip, $3 binary handling, $4 force OCR.
if [[ "$#" -lt 1 || "$#" -gt 4 ]]; then
    echo "Usage: $0 <listfile.txt> [common_prefix_to_strip] [binary_handling: metadata|hex|strings] [force_ocr_all_pdfs: true|false]"
    echo "Example: $0 myfiles.txt \"/mnt/mydata\" hex true"
    exit 1
fi

INPUT_LIST_FILE="$1"
if [[ -n "$2" ]]; then
    COMMON_PREFIX_TO_STRIP="$2"
fi
BINARY_HANDLING_USER_CHOICE="${3:-$DEFAULT_BINARY_HANDLING}"
FORCE_OCR_ALL_PDFS_USER_CHOICE="${4:-}"

# Fail early with a clear message instead of running with an empty list.
if [[ ! -r "$INPUT_LIST_FILE" || -d "$INPUT_LIST_FILE" ]]; then
    echo "ERROR: List file '$INPUT_LIST_FILE' does not exist or is not readable." >&2
    exit 1
fi

# Initialize Log File
# This truncates the log, so it has to happen before the first log_msg call;
# otherwise the validation warnings below would be wiped out.
echo "Conversion process started at $(date)" > "$LOG_FILE"

# Validate binary handling choice
case "$BINARY_HANDLING_USER_CHOICE" in
    metadata|hex|strings)
        BINARY_HANDLING="$BINARY_HANDLING_USER_CHOICE"
        ;;
    *)
        log_msg "WARN: Invalid binary_handling option '$BINARY_HANDLING_USER_CHOICE'. Defaulting to '$DEFAULT_BINARY_HANDLING'."
        BINARY_HANDLING="$DEFAULT_BINARY_HANDLING"
        ;;
esac

# Validate force OCR choice
case "$FORCE_OCR_ALL_PDFS_USER_CHOICE" in
    "") ;; # not given: keep the configured default
    true) FORCE_OCR_ALL_EXISTING_PDFS=true ;;
    false) FORCE_OCR_ALL_EXISTING_PDFS=false ;;
    *) log_msg "WARN: Invalid force_ocr_all_pdfs option '$FORCE_OCR_ALL_PDFS_USER_CHOICE'. Using default ($FORCE_OCR_ALL_EXISTING_PDFS)." ;;
esac

log_msg "INFO: Input List File: '$INPUT_LIST_FILE'"
log_msg "INFO: Common Prefix to Strip: '$COMMON_PREFIX_TO_STRIP'"
log_msg "INFO: Binary File Handling: '$BINARY_HANDLING'"
log_msg "INFO: Force OCR All Existing PDFs: $FORCE_OCR_ALL_EXISTING_PDFS"
log_msg "INFO: OCR Images to PDF: $OCR_IMAGES_TO_PDF"
log_msg "INFO: Output Base Directory: '$OUTPUT_DIR_BASE'"

check_commands # Check for essential tools

# Prepare clean file list (robust URL decoding)
# The list lives in a mktemp file that is removed on every exit path.
if ! CLEANED_LIST_FOR_PROCESSING_INTERNAL=$(mktemp "${TMPDIR:-/tmp}/cleaned_input_list_XXXXXX"); then
    log_msg "FATAL: Could not create a temporary file for the cleaned list. Exiting."
    echo "FATAL: Could not create a temporary file for the cleaned list. Exiting." >&2
    exit 1
fi
trap 'rm -f -- "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"' EXIT

log_msg "INFO: Preparing clean file list from '$INPUT_LIST_FILE'..."
if python3 -c 'import sys, urllib.parse; [print(urllib.parse.unquote(line.strip())) for line in sys.stdin if line.strip()]' < "$INPUT_LIST_FILE" > "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"; then
    log_msg "INFO: Clean file list created successfully using Python."
else
    log_msg "WARN: Python URL decoding failed (Python 3 not found or error). Using original list (may have issues with URL-encoded names)."
    cp "$INPUT_LIST_FILE" "$CLEANED_LIST_FOR_PROCESSING_INTERNAL" # Fallback
fi

# Create output base directory
mkdir -p "$OUTPUT_DIR_BASE"
log_msg "INFO: Ensured output base directory '$OUTPUT_DIR_BASE' exists."

# Process the list
total_files=0
failed_conversions=0

# The list is read on file descriptor 3 and convert_file gets /dev/null as
# stdin, so no conversion tool can consume lines of the list by accident.
while IFS= read -r file_to_process <&3 || [[ -n "$file_to_process" ]]; do
    # Skip empty or comment lines from the (cleaned) list
    if [[ -z "$file_to_process" || "$file_to_process" =~ ^# ]]; then
        continue
    fi

    total_files=$((total_files + 1))
    if [[ ! -e "$file_to_process" ]]; then # Use -e to check if path exists (file or dir)
        log_msg "ERROR: File or directory '$file_to_process' from list NOT FOUND. Skipping."
        failed_conversions=$((failed_conversions + 1))
        continue
    fi
    if [[ -d "$file_to_process" ]]; then
        log_msg "SKIP: Path '$file_to_process' is a DIRECTORY. Skipping."
        continue
    fi
    if [[ ! -f "$file_to_process" ]]; then
        log_msg "SKIP: Path '$file_to_process' is NOT A REGULAR FILE. Skipping."
        continue
    fi
    if [[ ! -r "$file_to_process" ]]; then
        log_msg "ERROR: File '$file_to_process' is NOT READABLE. Skipping."
        failed_conversions=$((failed_conversions + 1))
        continue
    fi

    # convert_file returns 0 both for a fresh conversion and for an output
    # that was already up to date; only a non-zero status counts as a failure.
    if ! convert_file "$file_to_process" "$BINARY_HANDLING" < /dev/null; then
        failed_conversions=$((failed_conversions + 1))
    fi
done 3< "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"

rm -f -- "$CLEANED_LIST_FOR_PROCESSING_INTERNAL" # Clean up temp list

# Final Summary (approximated from log counts for more detail)
successful_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - OK:" "$LOG_FILE")
error_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - ERROR:" "$LOG_FILE")
skipped_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - SKIP:" "$LOG_FILE")

log_msg "-----------------------------------------------------"
log_msg "Conversion process completed at $(date)"
log_msg "SUMMARY: Total items from list processed: $total_files"
log_msg "SUMMARY: Successful operations (OK): $successful_ops"
log_msg "SUMMARY: Errored operations (ERROR): $error_ops"
log_msg "SUMMARY: Skipped operations (SKIP): $skipped_ops (includes up-to-date outputs, directories, non-regular files)"
log_msg "SUMMARY: Items without an output PDF (not found, unreadable, or failed): $failed_conversions"
log_msg "INFO: Detailed log written to '$LOG_FILE'"
log_msg "INFO: Output PDFs are in subdirectories under '$OUTPUT_DIR_BASE'"

echo "Conversion complete. Log: $LOG_FILE. Output in $OUTPUT_DIR_BASE."
URL: https://ib.bsb.br/convert-everything-to-readable-pdfs