#!/bin/bash
# Print the usage summary for this script on stdout.
# Takes no arguments. The script name shown is taken from $0; the default
# directories are printed literally (with an unexpanded $HOME).
display_help() {
  printf '%s\n' \
    "Usage: $0 [search_directory] [destination_directory]" \
    "Extracts text from files in the specified search directory, concatenates them," \
    "and saves the result in the specified destination directory." \
    "If no arguments are provided, default directories are used:" \
    " - Search Directory: \$HOME/Documents" \
    " - Destination Directory: \$HOME/Desktop"
}
# When the first argument is -h or --help, print the usage text and exit
# successfully before any directory is touched.
case "$1" in
  -h | --help)
    display_help
    exit 0
    ;;
esac
# Input and output locations.
# $1 overrides the directory that is scanned, $2 the directory that receives
# the results. The three file paths below all sit directly inside the
# destination directory.
search_directory=${1:-"$HOME/Documents"}
destination_directory=${2:-"$HOME/Desktop"}
final_file="$destination_directory/final_concatenated_file.txt"
unsupported_file="$destination_directory/concatenated_contents_unsupported.txt"
temp_file_list="$destination_directory/temp_file_list.txt"
# MIME types that count as "supported". The functions below compare these
# strings verbatim against the output of `file -b --mime-type`.
# NOTE(review): some entries (e.g. application/x-zip-compressed,
# application/vnd.ms-word) may not be the names `file` actually reports --
# confirm against the installed magic database.
supported_types=(
  "text/plain"
  "text/markdown"
  "text/x-log"
  "text/x-srt"
  "text/x-microdvd"
  "text/csv"
  "text/xml"
  "application/json"
  "application/mbox"
  "application/vnd.ms-word"
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  "application/vnd.oasis.opendocument.text"
  "application/vnd.oasis.opendocument.spreadsheet"
  "application/vnd.oasis.opendocument.presentation"
  "application/pdf"
  "application/rtf"
  "text/html"
  "application/x-python"
  "application/java-archive"
  "text/javascript"
  "text/x-c++src"
  "text/x-chdr"
  "application/x-zip-compressed"
  "application/x-rar-compressed"
  "application/x-tar"
  "application/gzip"
  "application/vnd.ms-powerpoint"
  "application/vnd.openxmlformats-officedocument.presentationml.presentation"
  "text/x-tex"
)
# Decide whether a file's content looks like text.
#
# A file is accepted when either
#   * it contains no NUL bytes and the output of `strings` is at least half
#     as long (in characters) as the file is in bytes, or
#   * its MIME type, as printed by `file -b --mime-type`, is listed in the
#     global `supported_types` array.
#
# Globals:   supported_types (read)
# Arguments: $1 - path of the file to inspect
# Returns:   0 if the file is considered textual, 1 otherwise (including
#            when the path is not a readable regular file)
is_output_textual() {
  local file="$1"
  local is_text=0
  local file_size nul_free_size strings_output mime_type candidate

  # Without this guard every measurement below would be empty and a missing
  # file would pass the "strings >= size / 2" comparison as 0 >= 0.
  [[ -f "$file" && -r "$file" ]] || return 1

  # A bash string cannot hold a NUL byte, so a $'\0' grep pattern is really
  # the empty pattern and matches every line. Detect NULs by checking
  # whether deleting them changes the byte count instead.
  file_size=$(wc -c < "$file")
  nul_free_size=$(LC_ALL=C tr -d '\000' < "$file" | wc -c)

  if (( nul_free_size == file_size )); then
    # Extract printable strings and require them to make up a significant
    # share of the file.
    strings_output=$(strings "$file")
    if (( ${#strings_output} >= file_size / 2 )); then
      is_text=1
    fi
  fi

  # Fall back to the MIME type when the content heuristics did not accept
  # the file.
  if (( ! is_text )); then
    mime_type=$(file -b --mime-type "$file")
    for candidate in "${supported_types[@]}"; do
      if [[ "$mime_type" == "$candidate" ]]; then
        is_text=1
        break
      fi
    done
  fi

  # Exit status 0 when is_text is non-zero, 1 otherwise.
  (( is_text ))
}
# Report whether a file contains at least one NUL byte.
# Arguments: $1 - path to inspect
# Returns:   0 if a NUL byte is present, 1 otherwise (or if unreadable)
_process_file_has_nul() {
  local total stripped
  total=$(wc -c < "$1") || return 1
  # Deleting NULs shrinks the byte count exactly when the file has some.
  stripped=$(LC_ALL=C tr -d '\000' < "$1" | wc -c)
  (( stripped != total ))
}

# Append a file's content to a target file as a fenced block, preceded by a
# "<!-- label -->:" line.
# Arguments: $1 - target file, $2 - label (source path), $3 - content file
_process_file_append_fenced() {
  {
    echo "<!-- $2 -->:"
    echo "\`\`\`"
    cat "$3"
    echo "\`\`\`"
  } >> "$1"
}

# Process one input file.
#
# Files whose extension is in the supported list are converted through
# convert_to_text and the result is checked with is_output_textual; the
# converted text is left at $2 for the caller.
#
# Every other file is copied with `cat` and put through three checks (NUL
# bytes, share of printable strings, MIME type in `supported_types`). Only a
# file failing all three goes to `unsupported_file`; otherwise it goes to
# `final_file`. In both cases the content is appended here and $2 is removed.
#
# Globals:   supported_types (read), final_file, unsupported_file (appended)
# Arguments: $1 - input file
#            $2 - path where the extracted text is written
# Returns:   0 only when a supported file was converted and its output was
#            judged textual (the caller still has to concatenate it);
#            1 in every other case, including unsupported files that were
#            already appended by this function.
process_file() {
  local file="$1"
  local text_file_path="$2"
  local extension failed_checks strings_output file_size mime_type
  local known_type candidate

  if [[ ! -f "$file" ]]; then
    echo "Warning: '$file' is not a regular file. Skipping." >&2
    return 1
  fi

  extension="${file##*.}"
  case "$extension" in
    pdf|docx|html|htm|srt|sub|log|tex|latex|doc|ppt|pptx|zip|rar|tar|gz|odt|ods|odp|rtf|csv|xml|json|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
      echo "Processing file: $file"
      if ! convert_to_text "$file" "$text_file_path"; then
        echo "Error: Failed to convert $file" >&2
        return 1
      fi
      ;;
    *)
      echo "Warning: Unsupported file type '${extension}'. Processing using 'cat' command." >&2
      cat "$file" > "$text_file_path" 2>/dev/null

      failed_checks=0

      # Check 1: NUL bytes.
      if _process_file_has_nul "$text_file_path"; then
        failed_checks=$((failed_checks + 1))
      fi

      # Check 2: printable strings must amount to at least half the file.
      strings_output=$(strings "$text_file_path")
      file_size=$(wc -c < "$text_file_path")
      if (( ${#strings_output} < file_size / 2 )); then
        failed_checks=$((failed_checks + 1))
      fi

      # Check 3: MIME type must be one of the supported types.
      mime_type=$(file -b --mime-type "$text_file_path")
      known_type=0
      for candidate in "${supported_types[@]}"; do
        if [[ "$mime_type" == "$candidate" ]]; then
          known_type=1
          break
        fi
      done
      if (( ! known_type )); then
        failed_checks=$((failed_checks + 1))
      fi

      if (( failed_checks > 2 )); then
        echo "Warning: Output of $file failed more than two checks. Appending to unsupported file." >&2
        _process_file_append_fenced "$unsupported_file" "$file" "$text_file_path"
      else
        echo "Output of $file passed the checks. Appending to final file."
        _process_file_append_fenced "$final_file" "$file" "$text_file_path"
      fi
      rm -f "$text_file_path"
      # Non-zero on purpose: the content is already appended, so the caller
      # must not concatenate it a second time.
      return 1
      ;;
  esac

  if ! is_output_textual "$text_file_path"; then
    echo "Warning: Output of $file is not textual. Skipping." >&2
    rm -f "$text_file_path"
    return 1
  fi

  echo "File $file processed successfully"
  return 0
}
# Convert one file to plain text with the tool that matches its extension.
#
# Arguments: $1 - input file
#            $2 - path the extracted text is written to
# Returns:   0 when the selected tool exited successfully,
#            1 when it failed or when no converter exists for the extension
convert_to_text() {
  local file="$1"
  local text_file_path="$2"
  local extension="${file##*.}"
  local status

  case "$extension" in
    pdf)
      echo "Converting $file using pdftotext..."
      pdftotext "$file" "$text_file_path" 2>/dev/null
      ;;
    docx|odt|ods|odp)
      # NOTE(review): verify that the installed pandoc can read .ods/.odp.
      echo "Converting $file using pandoc..."
      pandoc "$file" -t plain -o "$text_file_path" 2>/dev/null
      ;;
    html|htm)
      echo "Converting $file using lynx..."
      lynx -dump -nolist -assume_charset UTF-8 -display_charset UTF-8 "$file" > "$text_file_path" 2>/dev/null
      ;;
    srt|sub|log|tex|latex|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
      echo "Copying $file..."
      cat "$file" > "$text_file_path" 2>/dev/null
      ;;
    doc)
      echo "Converting $file using antiword..."
      antiword "$file" > "$text_file_path" 2>/dev/null
      ;;
    ppt|pptx)
      # NOTE(review): confirm that catppt handles .pptx as well as .ppt.
      echo "Converting $file using catppt..."
      catppt "$file" > "$text_file_path" 2>/dev/null
      ;;
    zip|rar|tar|gz)
      echo "Extracting and processing $file..."
      extract_and_process_archive "$file" "$text_file_path"
      ;;
    rtf)
      echo "Converting $file using unrtf..."
      unrtf --text "$file" > "$text_file_path" 2>/dev/null
      ;;
    csv)
      echo "Processing $file using awk..."
      awk -F, '{print}' "$file" > "$text_file_path"
      ;;
    xml)
      echo "Processing $file using xmllint..."
      xmllint --xpath "//text()" "$file" > "$text_file_path" 2>/dev/null
      ;;
    json)
      echo "Converting $file using jq..."
      jq -r '.' "$file" > "$text_file_path" 2>/dev/null
      ;;
    *)
      # Without this branch an unknown extension would leave $? at 0 and be
      # reported as a successful conversion although nothing was written.
      echo "Error: No converter available for file type '${extension}' ($file)" >&2
      return 1
      ;;
  esac
  # Status of the conversion command run by the matching branch above.
  status=$?

  if (( status != 0 )); then
    echo "Error: Failed to convert $file" >&2
    return 1
  fi
  echo "Conversion of $file completed successfully"
  return 0
}
# Unpack an archive into a scratch directory, run process_file on every
# regular file found inside it, and collect the extracted text of all of
# them in one output file.
#
# Each nested file is converted into its own scratch file and then appended
# to $2, so the text of earlier members is not overwritten by later ones.
#
# Arguments: $1 - archive (.zip, .rar, .tar or .gz)
#            $2 - path the combined text is written to (truncated first)
# Returns:   0 after the archive was extracted and its members processed,
#            1 when a scratch directory could not be created or the
#            extraction command failed
extract_and_process_archive() {
  local file="$1"
  local text_file_path="$2"
  local temp_dir nested_dir nested_file nested_text status
  local nested_index=0

  # Declared separately from the assignment so a mktemp failure is visible.
  temp_dir=$(mktemp -d) || {
    echo "Error: Failed to extract archive '$file'" >&2
    return 1
  }
  # Nested outputs go to a second directory so the find below never sees them.
  nested_dir=$(mktemp -d) || {
    echo "Error: Failed to extract archive '$file'" >&2
    rm -rf "$temp_dir"
    return 1
  }

  case "${file##*.}" in
    zip)
      echo "Extracting $file using unzip..."
      unzip -q "$file" -d "$temp_dir"
      ;;
    rar)
      echo "Extracting $file using unrar..."
      # Trailing slash marks the argument as the destination directory.
      unrar x "$file" "$temp_dir/"
      ;;
    tar)
      echo "Extracting $file using tar..."
      tar -xf "$file" -C "$temp_dir"
      ;;
    gz)
      echo "Extracting $file using gunzip..."
      gunzip -c "$file" > "$temp_dir/$(basename "$file" .gz)"
      ;;
  esac
  status=$?

  if (( status != 0 )); then
    echo "Error: Failed to extract archive '$file'" >&2
    rm -rf "$temp_dir" "$nested_dir"
    return 1
  fi
  echo "Extraction of $file completed successfully"

  : > "$text_file_path"
  while IFS= read -r -d '' nested_file; do
    echo "Processing nested file: $nested_file"
    nested_index=$((nested_index + 1))
    nested_text="$nested_dir/$nested_index.txt"
    # stdin comes from /dev/null so a converter cannot swallow the list of
    # files that this loop is still reading.
    if process_file "$nested_file" "$nested_text" < /dev/null; then
      cat "$nested_text" >> "$text_file_path"
    fi
    rm -f "$nested_text"
  done < <(find "$temp_dir" -type f -print0)

  rm -rf "$temp_dir" "$nested_dir"
  return 0
}
# Append the extracted text of one file to the final output file as a fenced
# block preceded by a "<!-- source path -->:" line.
#
# Globals:   final_file (appended to)
# Arguments: $1 - file holding the extracted text
#            $2 - original source path, used only as the label
# Returns:   0 on success; 1 when the text file cannot be read or any of
#            the writes to final_file fails
concatenate_files() {
  local text_file_path="$1"
  local file="$2"

  # Checked up front so a missing input does not leave an empty, unterminated
  # block in the final file.
  if [[ ! -r "$text_file_path" ]]; then
    echo "Error: Failed to concatenate $text_file_path" >&2
    return 1
  fi

  # Chained with && so a failing cat is reported; checking $? after the last
  # echo alone would only reflect that echo.
  if ! {
    echo "<!-- $file -->:" &&
      echo "\`\`\`" &&
      cat "$text_file_path" &&
      echo "\`\`\`"
  } >> "$final_file"; then
    echo "Error: Failed to concatenate $text_file_path" >&2
    return 1
  fi

  echo "Concatenation of $text_file_path completed successfully"
  return 0
}
# Remove the private scratch directory that holds the intermediate text
# files. Only that directory is touched: other .txt files in the destination
# directory are left alone. Calling it again after the directory is gone is
# a no-op.
# Globals: work_directory (read, then cleared)
cleanup_temp_files() {
  if [[ -n "${work_directory:-}" && -d "$work_directory" ]]; then
    rm -rf -- "$work_directory"
    echo "Cleanup of temporary files completed successfully"
  fi
  work_directory=""
}

# Main script execution starts here
mkdir -p "$search_directory" "$destination_directory"
> "$final_file"
> "$unsupported_file"

# Take a snapshot of the input files before any scratch file is created.
# NUL-delimited so that unusual file names survive intact.
input_files=()
while IFS= read -r -d '' file; do
  input_files+=("$file")
done < <(find "$search_directory" -type f -print0)

# Intermediate text goes to a private directory instead of the destination
# directory, so cleanup never has to guess which .txt files are ours.
work_directory=$(mktemp -d) || {
  echo "Error: Failed to create a temporary working directory" >&2
  exit 1
}
trap cleanup_temp_files EXIT

file_index=0
for file in "${input_files[@]}"; do
  # Do not feed the script's own output files back into themselves when the
  # destination directory lies inside the search directory.
  if [[ "$file" -ef "$final_file" || "$file" -ef "$unsupported_file" ]]; then
    continue
  fi
  # A counter keeps scratch names short and unique, whatever the source path.
  file_index=$((file_index + 1))
  text_file_path="$work_directory/$file_index.txt"
  if process_file "$file" "$text_file_path"; then
    concatenate_files "$text_file_path" "$file"
  fi
  rm -f -- "$text_file_path"
done

cleanup_temp_files
echo "Process completed. All files have been processed and concatenated into ${final_file}."
echo "Unsupported file types have been processed using 'cat' command and concatenated into ${unsupported_file}."