#!/bin/bash
# enhanced_batch_convert_to_pdf.sh
# Converts diverse files to PDFs with readable text, metadata, or hex dumps.
# --- Configuration ---
# Defaults for one conversion run. Three of them can be overridden from the command line:
# argument 2 replaces COMMON_PREFIX_TO_STRIP, argument 3 selects the binary handling
# (falling back to DEFAULT_BINARY_HANDLING), argument 4 sets FORCE_OCR_ALL_EXISTING_PDFS.
#
# Optional: Set a common path prefix to strip from input file paths when creating output subdirectories.
# If your files are in /mnt/data/project1/docs and you set this to /mnt/data/project1,
# output for /mnt/data/project1/docs/file.txt will be in converted_pdfs/docs/file.pdf.
# If empty or not set, full paths (minus leading /) will be used for subdirectory structure.
# Input paths that do not start with this prefix also keep their full path (minus leading /).
COMMON_PREFIX_TO_STRIP="/mnt/mSATA/linaro/Desktop/00-TEMP/TCC/unique"
# How to handle binary/unconvertible files:
# "metadata": Create a PDF with file info (name, type, size).
# "hex": Create a PDF with a hex dump of the file.
# "strings": Create a PDF with printable strings from the file.
# Used when argument 3 is absent or is not one of the three values above.
DEFAULT_BINARY_HANDLING="strings"
# Force OCR on all existing PDFs, even if they seem to have a text layer.
# If false, OCRs only if no text layer is detected or if it's an image-to-PDF conversion.
# The value is executed as a command (`if $FORCE_OCR_ALL_EXISTING_PDFS`), so it must be
# exactly `true` or `false`.
FORCE_OCR_ALL_EXISTING_PDFS=false
# For images converted to PDF, should OCR be attempted?
# Also executed as a command; must be exactly `true` or `false`. Not settable from the command line.
OCR_IMAGES_TO_PDF=true
OUTPUT_DIR_BASE="converted_pdfs" # All output will go into subdirectories here
# Path of the run log, relative to the current working directory.
# The file is overwritten at the start of every run and appended to afterwards.
LOG_FILE="conversion_log_enhanced.txt"
# --- Helper Functions ---
# Append one timestamped entry to the run log.
# Globals:   LOG_FILE (read) - file the entry is appended to
# Arguments: $1 - message text
# Outputs:   a line of the form "YYYY-MM-DD HH:MM:SS - <message>" in LOG_FILE
log_msg() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
# Verify that every external tool the converters call is available on PATH.
# Globals:   none read directly (messages go through log_msg)
# Arguments: none
# Outputs:   one "ERROR: Required command ..." line per missing tool on stderr and in the log
# Returns:   0 when everything is present; otherwise terminates the script with exit status 1
check_commands() {
  # `strings` is included because create_strings_pdf calls it and "strings" is the
  # default binary handling; without the check a missing binutils only shows up
  # as a failed PDF for every binary file.
  local -a required=(
    "file" "libreoffice" "pandoc" "pdflatex" "convert" "jq" "enscript"
    "ps2pdf" "pdffonts" "ocrmypdf" "xxd" "man" "realpath" "mktemp" "dirname" "basename"
    "strings"
  )
  local -a missing=()
  local tool

  log_msg "INFO: Checking for required commands..."
  for tool in "${required[@]}"; do
    if command -v "$tool" > /dev/null 2>&1; then
      continue
    fi
    log_msg "ERROR: Required command '$tool' not found. Please install it."
    printf '%s\n' "ERROR: Required command '$tool' not found. Please install it." >&2
    missing+=("$tool")
  done

  if (( ${#missing[@]} > 0 )); then
    log_msg "FATAL: ${#missing[@]} required command(s) are missing. Exiting."
    printf '%s\n' "FATAL: ${#missing[@]} required command(s) are missing. Exiting." >&2
    exit 1
  fi
  log_msg "INFO: All required commands found."
}
# Check for embedded text in PDF by asking pdffonts which fonts the file uses.
# Arguments: $1 - path to the PDF
# Returns:   0 if at least one font row is listed (treated as "has a text layer"),
#            1 if none is listed or pdffonts itself fails
has_text_layer() {
  local listing font_rows

  # Run pdffonts once and keep its output; a failure means the file could not
  # be analysed, which callers treat the same as "no text layer".
  listing=$(pdffonts "$1" 2> /dev/null) || return 1

  # Lines 1-2 are skipped (presumably the column header and its separator).
  # The `$NF != "no"` filter is kept from the original counting rule; the last
  # column looks like an object id, so in practice every font row is counted.
  font_rows=$(awk 'NR>2 {if ($NF != "no") c++} END{print c+0}' <<< "$listing")

  [[ "$font_rows" -gt 0 ]]
}
# Normalize extension to lowercase.
# Arguments: $1 - file name or path
# Outputs:   the text after the last dot of the base name, lower-cased, followed by a
#            newline; an empty line when the base name contains no dot
normalize_ext() {
  local name ext
  name=$(basename -- "$1")
  ext=${name##*.}

  if [[ "$ext" == "$name" ]]; then # No extension
    printf '\n'
  else
    # printf instead of echo: an extension such as "-e" or "-n" would be
    # swallowed by echo as an option and come out empty.
    printf '%s\n' "$ext" | tr '[:upper:]' '[:lower:]'
  fi
}
# Create placeholder PDF with metadata (name, path, type, size, mtime) and the
# reason no real conversion was produced.
# Globals:   LOG_FILE (appended to by the tools and via log_msg)
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - `file` description (printed verbatim)
#            $5 - human-readable reason for the placeholder
# Returns:   0 if the PDF was written, 1 otherwise (the original always returned 0,
#            so callers could not tell a failed placeholder from a good one)
create_metadata_pdf() {
  local infile="$1"
  local outfile="$2"
  local detected_mimetype="$3"
  local file_description="$4"
  local reason="$5"
  local filesize human_size modified

  filesize=$(du -b "$infile" | cut -f1) # Size in bytes
  human_size=$(du -h "$infile" | cut -f1)
  modified=$(date -r "$infile")

  log_msg "INFO: Creating metadata PDF for '$infile'. Reason: $reason"
  {
    printf '%s\n' \
      "File Information" \
      "----------------" \
      "Original Filename: $(basename -- "$infile")" \
      "Full Path: $infile" \
      "Detected MIME Type: $detected_mimetype" \
      "File Command Description: $file_description" \
      "Size: $filesize bytes ($human_size)" \
      "Modification Date: $modified" \
      "" \
      "Reason for this Metadata PDF:" \
      "$reason" \
      "The content of the original file could not be meaningfully rendered as a standard document."
  } | enscript -B --font=Courier10 --word-wrap --margins=50:50:50:50 -p - -o - 2>> "$LOG_FILE" \
    | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1
  # Stage 1 is enscript, stage 2 is ps2pdf; `$?` alone would only cover ps2pdf.
  local -a stage_status=("${PIPESTATUS[@]}")

  if (( stage_status[1] == 0 && stage_status[2] == 0 )) && [[ -s "$outfile" ]]; then
    log_msg "OK: Metadata PDF created for '$infile' at '$outfile'."
    return 0
  fi
  log_msg "ERROR: Metadata PDF creation FAILED for '$infile'."
  return 1
}
# Create hex dump PDF: a metadata header followed by an xxd dump of at most the
# first 1 MiB of the file, typeset in landscape.
# Globals:   LOG_FILE (appended to by the tools and via log_msg)
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - `file` description (printed verbatim)
# Returns:   0 if the PDF was written, 1 otherwise (the original always returned 0)
create_hexdump_pdf() {
  local infile="$1"
  local outfile="$2"
  local detected_mimetype="$3"
  local file_description="$4"
  local filesize human_size modified

  filesize=$(du -b "$infile" | cut -f1)
  human_size=$(du -h "$infile" | cut -f1)
  modified=$(date -r "$infile")

  log_msg "INFO: Creating hex dump PDF for '$infile'."
  {
    printf '%s\n' \
      "File Information & Hex Dump" \
      "---------------------------" \
      "Original Filename: $(basename -- "$infile")" \
      "Full Path: $infile" \
      "Detected MIME Type: $detected_mimetype" \
      "File Command Description: $file_description" \
      "Size: $filesize bytes ($human_size)" \
      "Modification Date: $modified" \
      "" \
      "Hex Dump (first 1MB or full file if smaller):"
    xxd -l 1048576 "$infile" # Limit to 1MB to avoid huge PDFs
  } | enscript -B --font=Courier8 --word-wrap --margins=50:50:50:50 -r -p - -o - 2>> "$LOG_FILE" \
    | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1 # -r for landscape
  # Stage 1 is enscript, stage 2 is ps2pdf; `$?` alone would only cover ps2pdf.
  local -a stage_status=("${PIPESTATUS[@]}")

  if (( stage_status[1] == 0 && stage_status[2] == 0 )) && [[ -s "$outfile" ]]; then
    log_msg "OK: Hex dump PDF created for '$infile' at '$outfile'."
    return 0
  fi
  log_msg "ERROR: Hex dump PDF creation FAILED for '$infile'."
  return 1
}
# Create strings PDF: a metadata header followed by the printable strings that
# `strings` finds in the file (minimum length 4, decimal offsets, whole file).
# Globals:   LOG_FILE (appended to by the tools and via log_msg)
# Arguments: $1 - input file
#            $2 - output PDF path
#            $3 - detected MIME type (printed verbatim)
#            $4 - `file` description (printed verbatim)
# Returns:   0 if the PDF was written, 1 otherwise (the original always returned 0)
create_strings_pdf() {
  local infile="$1"
  local outfile="$2"
  local detected_mimetype="$3"
  local file_description="$4"
  local filesize human_size modified

  filesize=$(du -b "$infile" | cut -f1)
  human_size=$(du -h "$infile" | cut -f1)
  modified=$(date -r "$infile")

  log_msg "INFO: Creating extracted strings PDF for '$infile'."
  {
    printf '%s\n' \
      "File Information & Extracted Strings" \
      "------------------------------------" \
      "Original Filename: $(basename -- "$infile")" \
      "Full Path: $infile" \
      "Detected MIME Type: $detected_mimetype" \
      "File Command Description: $file_description" \
      "Size: $filesize bytes ($human_size)" \
      "Modification Date: $modified" \
      "" \
      "Extracted Printable Strings (UTF-8, min length 4):"
    # NOTE(review): --encoding=S selects 8-bit single-byte characters, which lets
    # UTF-8 bytes through but is not a UTF-8 decoder; the heading above is approximate.
    strings -n 4 -a -t d --encoding=S "$infile" # Show offset, include all file
  } | enscript -B --font=Courier10 --word-wrap --margins=50:50:50:50 -p - -o - 2>> "$LOG_FILE" \
    | ps2pdf - "$outfile" >> "$LOG_FILE" 2>&1
  # Stage 1 is enscript, stage 2 is ps2pdf; `$?` alone would only cover ps2pdf.
  local -a stage_status=("${PIPESTATUS[@]}")

  if (( stage_status[1] == 0 && stage_status[2] == 0 )) && [[ -s "$outfile" ]]; then
    log_msg "OK: Strings PDF created for '$infile' at '$outfile'."
    return 0
  fi
  log_msg "ERROR: Strings PDF creation FAILED for '$infile'."
  return 1
}
# --- Main Conversion Function ---

# Render a text file to PDF through `enscript | ps2pdf`.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - input file, $2 - output PDF, $3... - extra enscript options
# Returns:   0 only when both pipeline stages succeed and the output is non-empty
_cf_enscript_to_pdf() {
  local src="$1" dest="$2"
  shift 2
  enscript "$src" --font=Courier10 "$@" -p - -o - 2>> "$LOG_FILE" \
    | ps2pdf - "$dest" >> "$LOG_FILE" 2>&1
  local -a stages=("${PIPESTATUS[@]}")
  (( stages[0] == 0 && stages[1] == 0 )) && [[ -s "$dest" ]]
}

# Render a source listing with enscript: highlighted when a language name is
# given and enscript accepts it, otherwise as plain text.
# Arguments: $1 - input file, $2 - output PDF, $3 - enscript language name (may be empty)
_cf_enscript_listing_to_pdf() {
  local src="$1" dest="$2" lang="$3"
  if [[ -n "$lang" ]] && _cf_enscript_to_pdf "$src" "$dest" "--highlight=$lang"; then
    return 0
  fi
  _cf_enscript_to_pdf "$src" "$dest"
}

# Render a source file as a syntax-highlighted listing with Pandoc by wrapping
# it in a fenced Markdown code block (programming languages are highlighting
# classes in Pandoc, not input formats, so `--from=<language>` cannot work).
# Arguments: $1 - input file, $2 - output PDF, $3 - highlighting class (may be empty)
# Returns:   0 only when the file was read, Pandoc succeeded and the output is non-empty
_cf_pandoc_listing_to_pdf() {
  local src="$1" dest="$2" lang="$3"
  # A long fence makes an accidental match inside the source very unlikely.
  local fence='~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
  {
    if [[ -n "$lang" ]]; then
      printf '%s {.%s}\n' "$fence" "$lang"
    else
      printf '%s\n' "$fence"
    fi
    cat -- "$src"
    printf '\n%s\n' "$fence"
  } 2>> "$LOG_FILE" \
    | pandoc --from=markdown --standalone --highlight-style=kate -o "$dest" >> "$LOG_FILE" 2>&1
  local -a stages=("${PIPESTATUS[@]}")
  (( stages[0] == 0 && stages[1] == 0 )) && [[ -s "$dest" ]]
}

# Produce the configured stand-in PDF for content that cannot be rendered.
# Arguments: $1 - input file, $2 - output PDF, $3 - method (hex|strings|anything else = metadata),
#            $4 - MIME type, $5 - `file` description, $6 - reason (used by the metadata variant)
_cf_binary_placeholder() {
  local src="$1" dest="$2" method="$3" mime="$4" desc="$5" reason="$6"
  case "$method" in
    hex) create_hexdump_pdf "$src" "$dest" "$mime" "$desc" ;;
    strings) create_strings_pdf "$src" "$dest" "$mime" "$desc" ;;
    *) create_metadata_pdf "$src" "$dest" "$mime" "$desc" "$reason" ;;
  esac
}

# Convert one input file to a PDF under OUTPUT_DIR_BASE, choosing the tool from
# the MIME type reported by `file` (with the extension as a secondary hint).
# Globals:   COMMON_PREFIX_TO_STRIP, OUTPUT_DIR_BASE, FORCE_OCR_ALL_EXISTING_PDFS,
#            OCR_IMAGES_TO_PDF (read); LOG_FILE (appended to)
# Arguments: $1 - input file path
#            $2 - binary handling method: metadata | hex | strings
# Returns:   0 if an output PDF exists afterwards (converted, placeholder, or
#            skipped because it was already newer than the input); 1 otherwise
convert_file() {
  local infile="$1"
  local binary_handling_method="$2"
  local conversion_done=false

  # --- Determine output path ---
  # Default: mirror the full input path (minus the leading /) below the output base.
  local relative_path_to_input="${infile#/}"
  if [[ -n "$COMMON_PREFIX_TO_STRIP" ]]; then
    # Make sure the prefix ends in exactly the slash it needs for clean stripping.
    local temp_prefix="${COMMON_PREFIX_TO_STRIP%/}/"
    if [[ "$infile" == "$temp_prefix"* ]]; then
      relative_path_to_input="${infile#"$temp_prefix"}"
    fi
  fi

  local out_subdir
  out_subdir=$(dirname -- "$OUTPUT_DIR_BASE/$relative_path_to_input")
  if ! mkdir -p -- "$out_subdir"; then
    log_msg "ERROR: Cannot create output directory '$out_subdir' for '$infile'."
    return 1
  fi

  local in_filename
  in_filename=$(basename -- "$infile")
  local in_base="${in_filename%.*}"
  # Dotfiles such as ".bashrc" would otherwise lose their whole name and all
  # collapse onto "<dir>/.pdf"; keep the full name for them.
  [[ -n "$in_base" ]] || in_base="$in_filename"
  local outfile="$out_subdir/$in_base.pdf"

  local ext mimetype filedesc
  ext=$(normalize_ext "$in_filename")
  mimetype=$(file -b --mime-type -- "$infile" | cut -d';' -f1) # Remove charset
  filedesc=$(file -b -- "$infile")

  log_msg "-----------------------------------------------------"
  log_msg "START Processing: '$infile'"
  log_msg "INFO: MIME='$mimetype', Ext='$ext', Desc='$filedesc', OutFile='$outfile'"

  if [[ -f "$outfile" && "$outfile" -nt "$infile" ]]; then
    log_msg "SKIP: Output '$outfile' exists and is newer than '$infile'."
    return 0 # Indicate skipped
  fi

  # Whatever is at $outfile now is out of date. Remove it so that every
  # "does the output exist?" check below reflects this run only; otherwise a
  # failed conversion would be reported as OK because of the old file.
  if [[ -e "$outfile" && ! "$outfile" -ef "$infile" ]]; then
    rm -f -- "$outfile"
  fi

  case "$mimetype" in
    application/pdf)
      # Track the tool's status explicitly instead of reading $? after a
      # compound statement.
      local pdf_status=0
      if $FORCE_OCR_ALL_EXISTING_PDFS; then
        log_msg "INFO: PDF '$infile' - OCR FORCED."
        ocrmypdf --force-ocr "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$?
      elif ! has_text_layer "$infile"; then
        log_msg "INFO: PDF '$infile' needs OCR (no text layer detected)."
        ocrmypdf "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$? # Default: adds layer if missing
      else
        log_msg "INFO: PDF '$infile' has text layer. Copying."
        cp "$infile" "$outfile" >> "$LOG_FILE" 2>&1 || pdf_status=$?
        if (( pdf_status != 0 )); then
          log_msg "ERROR: Failed to copy PDF '$infile'."
        fi
      fi
      if (( pdf_status == 0 )) && [[ -f "$outfile" ]]; then
        log_msg "OK: PDF '$infile' processed to '$outfile'."
        conversion_done=true
      elif [[ ! -f "$outfile" ]]; then
        log_msg "ERROR: Processing PDF '$infile' failed. Output file not created."
      else
        log_msg "ERROR: Processing PDF '$infile' failed (exit status $pdf_status)."
      fi
      ;;
    application/msword|application/vnd.ms-word*|\
    application/vnd.openxmlformats-officedocument.wordprocessingml.document|\
    application/vnd.oasis.opendocument.text*|application/rtf|\
    application/vnd.ms-excel*|application/vnd.openxmlformats-officedocument.spreadsheetml.sheet|\
    application/vnd.oasis.opendocument.spreadsheet*)
      log_msg "INFO: Office document '$infile'. Converting with LibreOffice."
      # LibreOffice --convert-to pdf uses the input filename with .pdf extension
      # in the --outdir, which is exactly $outfile.
      libreoffice --headless --convert-to pdf "$infile" --outdir "$out_subdir" >> "$LOG_FILE" 2>&1
      if [[ -f "$outfile" ]]; then
        log_msg "OK: '$infile' converted via LibreOffice to '$outfile'."
        conversion_done=true
      else
        log_msg "ERROR: LibreOffice conversion FAILED for '$infile'. Output '$outfile' not found."
      fi
      ;;
    text/csv|text/tab-separated-values)
      log_msg "INFO: CSV/TSV '$infile'. Converting with Pandoc."
      if pandoc "$infile" -o "$outfile" --from=csv --toc --standalone >> "$LOG_FILE" 2>&1; then
        log_msg "OK: '$infile' converted via Pandoc."
        conversion_done=true
      else
        log_msg "ERROR: Pandoc (CSV) FAILED for '$infile'."
      fi
      ;;
    text/markdown)
      log_msg "INFO: Markdown '$infile'. Converting with Pandoc."
      if pandoc "$infile" -o "$outfile" --standalone >> "$LOG_FILE" 2>&1; then
        log_msg "OK: '$infile' converted via Pandoc."
        conversion_done=true
      else
        log_msg "ERROR: Pandoc (Markdown) FAILED for '$infile'."
      fi
      ;;
    application/json)
      log_msg "INFO: JSON '$infile'. Attempting pretty-print with jq then Pandoc."
      # The JSON is typeset as a highlighted listing. Handing Pandoc a *.json
      # file directly makes it expect its own document-AST JSON, so the
      # pretty-printed text is piped in as a fenced code block instead.
      local json_source="$infile" json_label="raw JSON"
      local tmp_json_pretty
      tmp_json_pretty=$(mktemp "$out_subdir/json_pretty_XXXXXX") || tmp_json_pretty=""
      if [[ -n "$tmp_json_pretty" ]] && jq . "$infile" > "$tmp_json_pretty" 2> /dev/null; then
        json_source="$tmp_json_pretty"
        json_label="pretty JSON"
      else
        log_msg "WARN: jq failed for '$infile'. Trying Pandoc on raw, then enscript."
      fi
      if _cf_pandoc_listing_to_pdf "$json_source" "$outfile" "json"; then
        log_msg "OK: '$infile' ($json_label) converted via Pandoc."
        conversion_done=true
      else
        log_msg "ERROR: Pandoc ($json_label) FAILED for '$infile'. Trying enscript."
        if _cf_enscript_to_pdf "$json_source" "$outfile"; then
          log_msg "OK: '$infile' (JSON) converted via enscript fallback."
          conversion_done=true
        else
          log_msg "ERROR: enscript fallback for JSON '$infile' FAILED."
        fi
      fi
      if [[ -n "$tmp_json_pretty" ]]; then
        rm -f -- "$tmp_json_pretty"
      fi
      ;;
    image/png|image/jpeg|image/gif|image/bmp|image/tiff|image/webp)
      log_msg "INFO: Image '$infile'."
      if $OCR_IMAGES_TO_PDF && command -v ocrmypdf &> /dev/null; then
        log_msg "Attempting OCR with ocrmypdf for image '$infile'."
        if ocrmypdf "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
          log_msg "OK: Image '$infile' OCR'd via ocrmypdf to '$outfile'."
          conversion_done=true
        else
          log_msg "ERROR: ocrmypdf FAILED for image '$infile'. Falling back to ImageMagick's convert."
          if convert "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
            log_msg "OK: Image '$infile' converted via ImageMagick (no OCR)."
            conversion_done=true
          else
            log_msg "ERROR: ImageMagick's convert also FAILED for '$infile'."
          fi
        fi
      else
        log_msg "Converting image '$infile' with ImageMagick's convert (no OCR)."
        if convert "$infile" "$outfile" >> "$LOG_FILE" 2>&1; then
          log_msg "OK: Image '$infile' converted via ImageMagick."
          conversion_done=true
        else
          log_msg "ERROR: ImageMagick's convert FAILED for '$infile'."
        fi
      fi
      ;;
    application/x-tex|text/x-tex|application/x-latex)
      if [[ "$ext" == "cls" || "$ext" == "sty" ]]; then # LaTeX class/style files
        log_msg "INFO: LaTeX Class/Style file '$infile'. Converting as syntax-highlighted text."
        if _cf_pandoc_listing_to_pdf "$infile" "$outfile" "latex"; then
          log_msg "OK: '$infile' converted via Pandoc (as LaTeX source)."
          conversion_done=true
        else
          log_msg "ERROR: Pandoc (LaTeX source) FAILED for '$infile'. Trying enscript."
          # enscript's highlighting language for LaTeX sources is "tex";
          # the helper drops to plain text if that is rejected.
          if _cf_enscript_listing_to_pdf "$infile" "$outfile" "tex"; then
            log_msg "OK: '$infile' converted via enscript (highlighted LaTeX)."
            conversion_done=true
          else
            log_msg "ERROR: enscript (highlighted LaTeX) FAILED for '$infile'."
          fi
        fi
      else # Regular .tex files
        log_msg "INFO: LaTeX document '$infile'. Compiling with pdflatex."
        local temp_tex_dir
        temp_tex_dir=$(mktemp -d "$out_subdir/tex_compile_XXXXXX") || temp_tex_dir=""
        if [[ -z "$temp_tex_dir" ]]; then
          # Without this guard the copy below would target "/" and the cleanup
          # would run `rm -rf` on an empty path.
          log_msg "ERROR: Could not create a temporary build directory for '$infile'."
        else
          cp "$infile" "$temp_tex_dir/" # Copy tex file to temp dir
          # If there are associated .bib files or images, they'd need to be copied too or paths adjusted.
          # This simplified version assumes self-contained .tex or resolvable paths from temp_tex_dir.
          local tex_filename_only="$in_filename"
          (
            cd "$temp_tex_dir" \
              && pdflatex -interaction=nonstopmode "$tex_filename_only" \
              && pdflatex -interaction=nonstopmode "$tex_filename_only"
          ) >> "$LOG_FILE" 2>&1
          local compiled_pdf_path="$temp_tex_dir/${tex_filename_only%.*}.pdf"
          if [[ -f "$compiled_pdf_path" ]]; then
            if mv "$compiled_pdf_path" "$outfile" >> "$LOG_FILE" 2>&1; then
              log_msg "OK: '$infile' compiled via pdflatex to '$outfile'."
              conversion_done=true
            else
              log_msg "ERROR: pdflatex compiled '$infile', but FAILED to move to '$outfile'."
            fi
          else
            log_msg "ERROR: pdflatex compilation FAILED for '$infile'. Output PDF not found in '$temp_tex_dir'."
          fi
          rm -rf -- "$temp_tex_dir"
        fi
      fi
      ;;
    application/x-bibtex|text/x-bibtex)
      log_msg "INFO: BibTeX file '$infile'. Converting with Pandoc."
      if pandoc "$infile" --standalone -o "$outfile" >> "$LOG_FILE" 2>&1; then
        log_msg "OK: '$infile' converted via Pandoc."
        conversion_done=true
      else
        log_msg "ERROR: Pandoc (BibTeX) FAILED for '$infile'. Trying enscript."
        if _cf_enscript_to_pdf "$infile" "$outfile"; then
          log_msg "OK: '$infile' (BibTeX) converted as plain text via enscript."
          conversion_done=true
        else
          log_msg "ERROR: enscript fallback for BibTeX '$infile' FAILED."
        fi
      fi
      ;;
    application/x-troff-man|text/troff) # Man pages
      log_msg "INFO: Man page '$infile'. Converting with 'man'."
      if man -Tpdf "$infile" > "$outfile" 2>> "$LOG_FILE" && [[ -s "$outfile" ]]; then # man outputs errors to stderr
        log_msg "OK: '$infile' converted via 'man -Tpdf'."
        conversion_done=true
      else
        log_msg "WARN: 'man -Tpdf' FAILED for '$infile'. Trying Pandoc."
        if pandoc "$infile" --standalone -f man -t pdf -o "$outfile" >> "$LOG_FILE" 2>&1; then
          log_msg "OK: '$infile' converted via Pandoc (man)."
          conversion_done=true
        else
          log_msg "ERROR: Pandoc (man) FAILED for '$infile'. Trying enscript."
          if _cf_enscript_to_pdf "$infile" "$outfile"; then
            log_msg "OK: '$infile' (man page) converted as text via enscript."
            conversion_done=true
          else
            log_msg "ERROR: enscript fallback for man page '$infile' FAILED."
          fi
        fi
      fi
      ;;
    text/x-python|text/x-shellscript|application/x-perl|application/x-ruby|\
    text/x-csrc|text/x-chdr|text/x-c++src|text/x-java|text/html|text/css|application/javascript|application/xml|text/xml) # Code, XML, HTML
      log_msg "INFO: Code/Markup file '$infile' ($mimetype). Converting with Pandoc (syntax highlighting)."
      # Map the extension to a highlighting language name.
      local listing_lang=""
      case "$ext" in
        py) listing_lang="python" ;;
        sh|bash) listing_lang="bash" ;;
        pl) listing_lang="perl" ;;
        rb) listing_lang="ruby" ;;
        c|h) listing_lang="c" ;;
        cpp|hpp|cxx) listing_lang="cpp" ;;
        java) listing_lang="java" ;;
        html|htm) listing_lang="html" ;;
        css) listing_lang="css" ;;
        js) listing_lang="javascript" ;;
        xml|bcf) listing_lang="xml" ;; # .bcf (Biber control file); "*.run.xml" arrives here as "xml"
      esac

      local pandoc_ok=false
      if [[ "$listing_lang" == "html" ]]; then
        # HTML is a real Pandoc input format: render the document itself.
        if pandoc "$infile" --from=html --standalone --highlight-style=kate -o "$outfile" >> "$LOG_FILE" 2>&1 \
          && [[ -s "$outfile" ]]; then
          pandoc_ok=true
        fi
      elif _cf_pandoc_listing_to_pdf "$infile" "$outfile" "$listing_lang"; then
        pandoc_ok=true
      fi

      if [[ "$pandoc_ok" == true ]]; then
        log_msg "OK: '$infile' converted via Pandoc with highlighting."
        conversion_done=true
      else
        log_msg "WARN: Pandoc with highlighting FAILED for '$infile'. Trying enscript."
        # Pass the language name rather than the raw extension ("python", not
        # "py"); the helper retries without highlighting if enscript rejects it.
        if _cf_enscript_listing_to_pdf "$infile" "$outfile" "$listing_lang"; then
          log_msg "OK: '$infile' converted via enscript."
          conversion_done=true
        else
          log_msg "ERROR: enscript fallback for '$infile' FAILED."
        fi
      fi
      ;;
    text/*) # Generic text files (.log, .txt, .bak, .aux, .synctex, .blg, .info, .lst, .conf etc.)
      log_msg "INFO: Generic text file '$infile' ($mimetype, ext: .$ext). Converting with enscript."
      if _cf_enscript_to_pdf "$infile" "$outfile" --word-wrap; then
        log_msg "OK: '$infile' converted via enscript."
        conversion_done=true
      else
        log_msg "ERROR: enscript FAILED for '$infile'."
      fi
      ;;
    application/octet-stream|application/x-dosexec|application/x-sharedlib|\
    application/x-object|application/x-executable|application/x-sqlite3|inode/x-empty|\
    application/x-archive|application/zip|application/gzip|application/x-bzip2|application/x-xz)
      log_msg "INFO: Binary/Archive/Empty/Unknown file '$infile' (MIME: $mimetype, Desc: $filedesc)."
      if [[ "$mimetype" == "inode/x-empty" ]]; then
        create_metadata_pdf "$infile" "$outfile" "$mimetype" "$filedesc" "File is empty."
        conversion_done=true
      # Heuristic: if 'filedesc' suggests text despite octet-stream, try enscript
      elif [[ "$filedesc" == *"text"* || "$filedesc" == *"script"* ]]; then
        log_msg "INFO: MIME is '$mimetype', but filedesc suggests text ('$filedesc'). Trying enscript."
        if _cf_enscript_to_pdf "$infile" "$outfile" --word-wrap; then
          log_msg "OK: '$infile' converted via enscript (heuristic)."
        else
          log_msg "ERROR: enscript (heuristic) FAILED for '$infile'. Using binary handling."
          _cf_binary_placeholder "$infile" "$outfile" "$binary_handling_method" "$mimetype" "$filedesc" \
            "Binary or undetermined content type ($mimetype)."
        fi
        conversion_done=true # Placeholder PDF is a form of "done"
      else # Standard binary handling
        _cf_binary_placeholder "$infile" "$outfile" "$binary_handling_method" "$mimetype" "$filedesc" \
          "Binary, archive, or undetermined content type ($mimetype)."
        conversion_done=true # Placeholder PDF is a form of "done"
      fi
      ;;
    *) # Fallback for truly unrecognized/unhandled MIME types
      log_msg "WARN: Unhandled MIME type '$mimetype' for '$infile'. File description: '$filedesc'."
      if [[ "$filedesc" == *"text"* || "$filedesc" == *"script"* ]]; then
        log_msg "INFO: Unhandled MIME, but filedesc suggests text ('$filedesc'). Trying enscript."
        if _cf_enscript_to_pdf "$infile" "$outfile" --word-wrap; then
          log_msg "OK: '$infile' converted via enscript (heuristic for unhandled MIME)."
        else
          log_msg "ERROR: enscript (heuristic for unhandled MIME) FAILED for '$infile'. Using binary handling."
          _cf_binary_placeholder "$infile" "$outfile" "$binary_handling_method" "$mimetype" "$filedesc" \
            "Unhandled MIME type ($mimetype) and not clearly text."
        fi
        conversion_done=true
      else
        log_msg "INFO: Unhandled MIME type '$mimetype' for '$infile'. Using configured binary handling."
        _cf_binary_placeholder "$infile" "$outfile" "$binary_handling_method" "$mimetype" "$filedesc" \
          "Unhandled MIME type ($mimetype)."
        conversion_done=true
      fi
      ;;
  esac

  if [[ "$conversion_done" != true ]]; then
    if [[ -s "$outfile" ]]; then
      log_msg "WARN: Conversion logic did not explicitly set 'done' flag, but output file '$outfile' exists. Assuming prior step handled it."
    else
      # A failed tool may leave a zero-byte file behind; replace it with a placeholder.
      rm -f -- "$outfile"
      log_msg "FALLBACK: No conversion method succeeded and no output file created for '$infile'. Creating placeholder."
      create_metadata_pdf "$infile" "$outfile" "$mimetype" "$filedesc" "No applicable conversion rule or all attempts failed."
    fi
  fi

  log_msg "END Processing: '$infile'"
  [[ -f "$outfile" ]] # Return 0 if output exists, 1 otherwise
}
# --- Script Main Logic ---
# Usage: <script> <listfile.txt> [common_prefix_to_strip] [binary_handling] [force_ocr_all_pdfs]
# Reads one path per line from the list file (URL-decoded when python3 is
# available), converts each regular, readable file with convert_file, and
# writes a summary to the log.

# Argument Parsing
if [[ "$#" -lt 1 || "$#" -gt 4 ]]; then
  echo "Usage: $0 <listfile.txt> [common_prefix_to_strip] [binary_handling: metadata|hex|strings] [force_ocr_all_pdfs: true|false]" >&2
  echo "Example: $0 myfiles.txt \"/mnt/mydata\" hex true" >&2
  exit 1
fi

INPUT_LIST_FILE="$1"
if [[ ! -r "$INPUT_LIST_FILE" || -d "$INPUT_LIST_FILE" ]]; then
  printf "ERROR: Input list file '%s' does not exist or is not readable.\n" "$INPUT_LIST_FILE" >&2
  exit 1
fi

# Initialize Log File
# Done before any log_msg call: the log is truncated here, so warnings emitted
# while validating the options below would otherwise be wiped.
echo "Conversion process started at $(date)" > "$LOG_FILE"

if [[ -n "${2:-}" ]]; then
  COMMON_PREFIX_TO_STRIP="$2"
fi
BINARY_HANDLING_USER_CHOICE="${3:-$DEFAULT_BINARY_HANDLING}"
FORCE_OCR_ALL_PDFS_USER_CHOICE="${4:-}"

# Validate binary handling choice
case "$BINARY_HANDLING_USER_CHOICE" in
  metadata|hex|strings)
    BINARY_HANDLING="$BINARY_HANDLING_USER_CHOICE"
    ;;
  *)
    log_msg "WARN: Invalid binary_handling option '$BINARY_HANDLING_USER_CHOICE'. Defaulting to '$DEFAULT_BINARY_HANDLING'."
    BINARY_HANDLING="$DEFAULT_BINARY_HANDLING"
    ;;
esac

# Validate force OCR choice
case "$FORCE_OCR_ALL_PDFS_USER_CHOICE" in
  "") ;; # Not given: keep the configured default
  true) FORCE_OCR_ALL_EXISTING_PDFS=true ;;
  false) FORCE_OCR_ALL_EXISTING_PDFS=false ;;
  *)
    log_msg "WARN: Invalid force_ocr_all_pdfs option '$FORCE_OCR_ALL_PDFS_USER_CHOICE'. Using default ($FORCE_OCR_ALL_EXISTING_PDFS)."
    ;;
esac

log_msg "INFO: Input List File: '$INPUT_LIST_FILE'"
log_msg "INFO: Common Prefix to Strip: '$COMMON_PREFIX_TO_STRIP'"
log_msg "INFO: Binary File Handling: '$BINARY_HANDLING'"
log_msg "INFO: Force OCR All Existing PDFs: $FORCE_OCR_ALL_EXISTING_PDFS"
log_msg "INFO: OCR Images to PDF: $OCR_IMAGES_TO_PDF"
log_msg "INFO: Output Base Directory: '$OUTPUT_DIR_BASE'"

check_commands # Check for essential tools

# Prepare clean file list (robust URL decoding)
# A private temp file (removed on any exit path) replaces the fixed name in the
# current directory, which could clobber a user file or collide between runs.
CLEANED_LIST_FOR_PROCESSING_INTERNAL=$(mktemp "${TMPDIR:-/tmp}/cleaned_input_list.XXXXXX") || {
  log_msg "FATAL: Could not create a temporary file for the cleaned input list."
  echo "FATAL: Could not create a temporary file for the cleaned input list." >&2
  exit 1
}
trap 'rm -f -- "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"' EXIT

log_msg "INFO: Preparing clean file list from '$INPUT_LIST_FILE'..."
if python3 -c 'import sys, urllib.parse; [print(urllib.parse.unquote(line.strip())) for line in sys.stdin if line.strip()]' < "$INPUT_LIST_FILE" > "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"; then
  log_msg "INFO: Clean file list created successfully using Python."
else
  log_msg "WARN: Python URL decoding failed (Python 3 not found or error). Using original list (may have issues with URL-encoded names)."
  cp "$INPUT_LIST_FILE" "$CLEANED_LIST_FOR_PROCESSING_INTERNAL" # Fallback
fi

# Create output base directory
mkdir -p "$OUTPUT_DIR_BASE"
log_msg "INFO: Ensured output base directory '$OUTPUT_DIR_BASE' exists."

# Process the list
total_files=0
failed_conversions=0

# The list is read on file descriptor 3 and convert_file gets /dev/null as
# stdin, so a converter that reads standard input cannot swallow list entries.
while IFS= read -r -u 3 file_to_process || [[ -n "$file_to_process" ]]; do
  # Skip empty or comment lines from the (cleaned) list
  if [[ -z "$file_to_process" || "$file_to_process" =~ ^# ]]; then
    continue
  fi
  total_files=$((total_files + 1))

  if [[ ! -e "$file_to_process" ]]; then # Use -e to check if path exists (file or dir)
    log_msg "ERROR: File or directory '$file_to_process' from list NOT FOUND. Skipping."
    failed_conversions=$((failed_conversions + 1))
    continue
  fi
  if [[ -d "$file_to_process" ]]; then
    log_msg "SKIP: Path '$file_to_process' is a DIRECTORY. Skipping."
    continue
  fi
  if [[ ! -f "$file_to_process" ]]; then
    log_msg "SKIP: Path '$file_to_process' is NOT A REGULAR FILE. Skipping."
    continue
  fi
  if [[ ! -r "$file_to_process" ]]; then
    log_msg "ERROR: File '$file_to_process' is NOT READABLE. Skipping."
    failed_conversions=$((failed_conversions + 1))
    continue
  fi

  # convert_file returns 0 both for a produced PDF and for an up-to-date skip;
  # the two are told apart in the log (OK vs SKIP), not here.
  if ! convert_file "$file_to_process" "$BINARY_HANDLING" < /dev/null; then
    failed_conversions=$((failed_conversions + 1))
  fi
done 3< "$CLEANED_LIST_FOR_PROCESSING_INTERNAL"

rm -f -- "$CLEANED_LIST_FOR_PROCESSING_INTERNAL" # Clean up temp list

# Final Summary (approximated from log counts for more detail)
successful_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - OK:" "$LOG_FILE")
error_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - ERROR:" "$LOG_FILE")
skipped_ops=$(grep -cE "^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - SKIP:" "$LOG_FILE") # Includes up-to-date and non-file skips

log_msg "-----------------------------------------------------"
log_msg "Conversion process completed at $(date)"
log_msg "SUMMARY: Total items from list processed: $total_files"
log_msg "SUMMARY: Successful operations (OK): $successful_ops"
log_msg "SUMMARY: Errored operations (ERROR): $error_ops"
log_msg "SUMMARY: Skipped operations (SKIP/NOT FOUND/DIR): $skipped_ops (includes up-to-date, not found, directories)"
log_msg "SUMMARY: Items without a resulting PDF (not found, unreadable, or conversion failed): $failed_conversions"
log_msg "INFO: Detailed log written to '$LOG_FILE'"
log_msg "INFO: Output PDFs are in subdirectories under '$OUTPUT_DIR_BASE'"
echo "Conversion complete. Log: $LOG_FILE. Output in $OUTPUT_DIR_BASE."
# URL: https://ib.bsb.br/convert-everything-to-readable-pdfs