cat files contents [script]

#!/bin/bash
#
# Extracts text from every file under a search directory and concatenates the
# results into a single file in a destination directory.
#
# Usage: <script> [search_directory] [destination_directory]
#   search_directory       defaults to $HOME/Documents
#   destination_directory  defaults to $HOME/Desktop
#
# Outputs (inside the destination directory):
#   final_concatenated_file.txt           text extracted from all files
#   concatenated_contents_unsupported.txt raw contents of files with an
#                                         unknown extension that look binary
#
# External converters used when the matching file type is met: pdftotext,
# pandoc, lynx, antiword, catppt, unrtf, xmllint, jq, unzip, unrar, tar,
# gunzip, file.

#######################################
# Prints usage information.
# Outputs: help text on stdout
#######################################
display_help() {
  echo "Usage: $0 [search_directory] [destination_directory]"
  echo "Extracts text from files in the specified search directory, concatenates them,"
  echo "and saves the result in the specified destination directory."
  echo "If no arguments are provided, default directories are used:"
  echo " - Search Directory: \$HOME/Documents"
  echo " - Destination Directory: \$HOME/Desktop"
}

# MIME types (as reported by `file --mime-type`) accepted as textual output.
# NOTE(review): the two bare "application/" entries can never equal a real
# MIME type; they look like truncated values (possibly the .doc / .ppt types)
# and are kept verbatim until the intended values are confirmed.
supported_types=(
  "text/plain"
  "text/markdown"
  "text/x-log"
  "text/x-srt"
  "text/x-microdvd"
  "text/csv"
  "text/xml"
  "application/json"
  "application/mbox"
  "application/"
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  "application/vnd.oasis.opendocument.text"
  "application/vnd.oasis.opendocument.spreadsheet"
  "application/vnd.oasis.opendocument.presentation"
  "application/pdf"
  "application/rtf"
  "text/html"
  "application/x-python"
  "application/java-archive"
  "text/javascript"
  "text/x-c++src"
  "text/x-chdr"
  "application/x-zip-compressed"
  "application/x-rar-compressed"
  "application/x-tar"
  "application/gzip"
  "application/"
  "application/vnd.openxmlformats-officedocument.presentationml.presentation"
  "text/x-tex"
)

#######################################
# Prints the lower-cased extension of a path's final component.
# Prints nothing when the file name contains no dot, so dots in parent
# directory names are never mistaken for an extension.
# Arguments: $1 - path
# Outputs:   extension (without the dot) on stdout
#######################################
file_extension() {
  local name="${1##*/}"
  if [[ "$name" != *.* ]]; then
    return 0
  fi
  printf '%s' "${name##*.}" | tr '[:upper:]' '[:lower:]'
}

#######################################
# Tells whether a file contains at least one NUL byte.
# Bash strings cannot hold NUL, so a grep pattern built from $'\0' is empty
# and matches everything; byte counts before/after stripping NUL are compared
# instead.
# Arguments: $1 - path to a readable file
# Returns:   0 if a NUL byte is present, 1 otherwise
#######################################
has_null_bytes() {
  local file="$1"
  local total stripped
  total=$(wc -c < "$file") || return 1
  stripped=$(LC_ALL=C tr -d '\000' < "$file" | wc -c) || return 1
  ((stripped < total))
}

#######################################
# Tells whether at least half of a file's bytes are printable ASCII or
# whitespace. An empty file counts as printable.
# Arguments: $1 - path to a readable file
# Returns:   0 if mostly printable, 1 otherwise
#######################################
is_mostly_printable() {
  local file="$1"
  local total printable
  total=$(wc -c < "$file") || return 1
  printable=$(LC_ALL=C tr -cd '[:print:][:space:]' < "$file" | wc -c) || return 1
  ((printable >= total / 2))
}

#######################################
# Tells whether the MIME type reported by `file` is listed in supported_types.
# Globals:   supported_types (read)
# Arguments: $1 - path
# Returns:   0 if supported, 1 otherwise (including when `file` fails)
#######################################
has_supported_mime_type() {
  local file="$1"
  local mime_type candidate
  mime_type=$(file -b --mime-type -- "$file" 2>/dev/null) || return 1
  for candidate in "${supported_types[@]}"; do
    if [[ "$mime_type" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Decides whether a converted output file is textual.
# A file is textual when it has no NUL bytes and is mostly printable, or,
# failing that, when its MIME type is in supported_types.
# Arguments: $1 - path
# Returns:   0 if textual, 1 otherwise (also for a missing/unreadable file)
#######################################
is_output_textual() {
  local file="$1"
  if [[ ! -f "$file" || ! -r "$file" ]]; then
    return 1
  fi
  if ! has_null_bytes "$file" && is_mostly_printable "$file"; then
    return 0
  fi
  has_supported_mime_type "$file"
}

#######################################
# Appends a labelled, fenced copy of a file to a target file.
# A newline is inserted before the closing fence when the content does not
# end with one, so the fence always starts on its own line.
# Arguments: $1 - label written into the header comment
#            $2 - path of the content to append
#            $3 - path of the file to append to
# Returns:   0 on success, non-zero if any write or the read fails
#######################################
append_fenced() {
  local label="$1"
  local content="$2"
  local target="$3"
  {
    printf '<!-- %s -->:\n' "$label" &&
      printf '```\n' &&
      cat -- "$content" &&
      if [[ -s "$content" ]] && (($(tail -c 1 -- "$content" | wc -l) == 0)); then
        printf '\n'
      fi &&
      printf '```\n'
  } >> "$target"
}

#######################################
# Handles a file whose extension has no dedicated converter: copies it
# verbatim, runs three checks (NUL bytes, printable ratio, MIME type) and
# appends it to the unsupported file only when all three fail, otherwise to
# the final file.
# Globals:   final_file, unsupported_file (appended to)
# Arguments: $1 - source file
#            $2 - scratch path for the copy (removed before returning)
# Returns:   0 if the content was appended somewhere, 1 on read/write failure
#######################################
process_unsupported_file() {
  local file="$1"
  local text_file_path="$2"
  local failed_checks=0
  local status=0

  echo "Warning: Unsupported file type '$(file_extension "$file")'. Processing using 'cat' command." >&2
  if ! cat -- "$file" > "$text_file_path" 2>/dev/null; then
    echo "Error: Failed to read $file" >&2
    rm -f -- "$text_file_path"
    return 1
  fi

  has_null_bytes "$text_file_path" && failed_checks=$((failed_checks + 1))
  is_mostly_printable "$text_file_path" || failed_checks=$((failed_checks + 1))
  has_supported_mime_type "$text_file_path" || failed_checks=$((failed_checks + 1))

  if ((failed_checks > 2)); then
    echo "Warning: Output of $file failed more than two checks. Appending to unsupported file." >&2
    append_fenced "$file" "$text_file_path" "$unsupported_file" || status=1
  else
    echo "Output of $file passed the checks. Appending to final file."
    append_fenced "$file" "$text_file_path" "$final_file" || status=1
  fi
  rm -f -- "$text_file_path"
  return "$status"
}

#######################################
# Converts one file to text at the given output path.
# Files with an unknown extension are appended straight to the final or
# unsupported file instead (see process_unsupported_file).
# Arguments: $1 - source file
#            $2 - path the extracted text is written to
# Returns:   0 only when $2 now holds textual output that the caller still
#            has to concatenate; 1 on failure, on non-textual output, and
#            after the unknown-extension path (its output is already stored)
#######################################
process_file() {
  local file="$1"
  local text_file_path="$2"

  if [[ ! -f "$file" ]]; then
    echo "Warning: '$file' is not a regular file. Skipping." >&2
    return 1
  fi

  case "$(file_extension "$file")" in
    pdf | docx | html | htm | srt | sub | log | tex | latex | doc | ppt | pptx | zip | rar | tar | gz | odt | ods | odp | rtf | csv | xml | json | py | java | js | cpp | h | md | markdown | txt | eml | mbox)
      echo "Processing file: $file"
      # convert_to_text reports its own failure on stderr.
      if ! convert_to_text "$file" "$text_file_path"; then
        return 1
      fi
      ;;
    *)
      process_unsupported_file "$file" "$text_file_path"
      # Always non-zero: the content has already been appended, so the caller
      # must not concatenate it a second time.
      return 1
      ;;
  esac

  if ! is_output_textual "$text_file_path"; then
    echo "Warning: Output of $file is not textual. Skipping." >&2
    rm -f -- "$text_file_path"
    return 1
  fi

  echo "File $file processed successfully"
  return 0
}

#######################################
# Runs the converter matching the file's extension.
# Converter chatter on stderr is discarded on purpose; failure is detected
# through the exit status and reported once here.
# Arguments: $1 - source file
#            $2 - path the extracted text is written to
# Returns:   0 on success, 1 if the converter failed or none applies
#######################################
convert_to_text() {
  local file="$1"
  local text_file_path="$2"
  local status=0

  case "$(file_extension "$file")" in
    pdf)
      echo "Converting $file using pdftotext..."
      pdftotext "$file" "$text_file_path" 2>/dev/null || status=$?
      ;;
    docx | odt | ods | odp)
      echo "Converting $file using pandoc..."
      pandoc "$file" -t plain -o "$text_file_path" 2>/dev/null || status=$?
      ;;
    html | htm)
      echo "Converting $file using lynx..."
      lynx -dump -nolist -assume_charset UTF-8 -display_charset UTF-8 "$file" \
        > "$text_file_path" 2>/dev/null || status=$?
      ;;
    srt | sub | log | tex | latex | py | java | js | cpp | h | md | markdown | txt | eml | mbox)
      echo "Copying $file..."
      cat -- "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    doc)
      echo "Converting $file using antiword..."
      antiword "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    ppt | pptx)
      echo "Converting $file using catppt..."
      catppt "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    zip | rar | tar | gz)
      echo "Extracting and processing $file..."
      extract_and_process_archive "$file" "$text_file_path" || status=$?
      ;;
    rtf)
      echo "Converting $file using unrtf..."
      unrtf --text "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    csv)
      echo "Processing $file using awk..."
      awk -F, '{print}' "$file" > "$text_file_path" || status=$?
      ;;
    xml)
      echo "Processing $file using xmllint..."
      xmllint --xpath "//text()" "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    json)
      echo "Converting $file using jq..."
      jq -r '.' "$file" > "$text_file_path" 2>/dev/null || status=$?
      ;;
    *)
      # No converter for this extension: nothing was written.
      status=1
      ;;
  esac

  if ((status != 0)); then
    echo "Error: Failed to convert $file" >&2
    return 1
  fi

  echo "Conversion of $file completed successfully"
  return 0
}

#######################################
# Extracts an archive into a private temporary directory, converts every
# extracted file and collects the results in one output file.
# Each member is converted into its own scratch file and then appended, so
# members no longer overwrite one another. Members with an unknown extension
# are appended directly to the final/unsupported file by process_file.
# Arguments: $1 - archive (zip, rar, tar or gz)
#            $2 - path the combined text is written to
# Returns:   0 on success, 1 if the temp dir or the extraction failed
#######################################
extract_and_process_archive() {
  local file="$1"
  local text_file_path="$2"
  local temp_dir nested_file nested_output name
  local status=0

  if ! temp_dir=$(mktemp -d); then
    echo "Error: Failed to create a temporary directory for '$file'" >&2
    return 1
  fi

  case "$(file_extension "$file")" in
    zip)
      echo "Extracting $file using unzip..."
      unzip -q "$file" -d "$temp_dir" || status=$?
      ;;
    rar)
      echo "Extracting $file using unrar..."
      # unrar only treats the last argument as a destination directory when
      # it ends with a path separator.
      unrar x "$file" "$temp_dir/" || status=$?
      ;;
    tar)
      echo "Extracting $file using tar..."
      tar -xf "$file" -C "$temp_dir" || status=$?
      ;;
    gz)
      echo "Extracting $file using gunzip..."
      name="${file##*/}"
      gunzip -c "$file" > "$temp_dir/${name%.*}" || status=$?
      ;;
  esac

  if ((status != 0)); then
    echo "Error: Failed to extract archive '$file'" >&2
    rm -rf -- "$temp_dir"
    return 1
  fi

  echo "Extraction of $file completed successfully"

  : > "$text_file_path"
  while IFS= read -r -d '' nested_file; do
    echo "Processing nested file: $nested_file"
    nested_output=$(mktemp) || continue
    # stdin is detached so no converter can swallow the NUL-delimited list.
    if process_file "$nested_file" "$nested_output" < /dev/null; then
      {
        printf '<!-- %s -->:\n' "${nested_file#"$temp_dir"/}"
        cat -- "$nested_output"
        if [[ -s "$nested_output" ]] && (($(tail -c 1 -- "$nested_output" | wc -l) == 0)); then
          printf '\n'
        fi
      } >> "$text_file_path"
    fi
    rm -f -- "$nested_output"
  done < <(find "$temp_dir" -type f -print0)

  rm -rf -- "$temp_dir"
  return 0
}

#######################################
# Appends one converted text file to the final output, labelled and fenced.
# Globals:   final_file (appended to)
# Arguments: $1 - path of the converted text
#            $2 - original file path, used as the label
# Returns:   0 on success, 1 if any part of the append failed
#######################################
concatenate_files() {
  local text_file_path="$1"
  local file="$2"

  if ! append_fenced "$file" "$text_file_path" "$final_file"; then
    echo "Error: Failed to concatenate $text_file_path" >&2
    return 1
  fi

  echo "Concatenation of $text_file_path completed successfully"
  return 0
}

#######################################
# Removes the private scratch directory holding intermediate text files.
# Only that directory is touched; nothing in the destination directory is
# deleted.
# Globals:   work_dir (read)
#######################################
cleanup_temp_files() {
  if [[ -n "${work_dir:-}" && -d "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
  echo "Cleanup of temporary files completed successfully"
}

#######################################
# Entry point: parses arguments, prepares the output files and processes
# every regular file under the search directory.
# Globals:   search_directory, destination_directory, final_file,
#            unsupported_file, work_dir (all written)
# Arguments: $1 - search directory, or -h/--help (optional)
#            $2 - destination directory (optional)
# Returns:   0 on success, 1 if the directories or outputs cannot be set up
#######################################
main() {
  local file text_file_path

  if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
    display_help
    return 0
  fi

  search_directory="${1:-$HOME/Documents}"
  destination_directory="${2:-$HOME/Desktop}"
  final_file="${destination_directory}/final_concatenated_file.txt"
  unsupported_file="${destination_directory}/concatenated_contents_unsupported.txt"

  # A missing input directory is reported instead of being created, which
  # would silently turn a mistyped path into an empty result.
  if [[ ! -d "$search_directory" ]]; then
    echo "Error: Search directory '$search_directory' does not exist." >&2
    return 1
  fi
  if ! mkdir -p -- "$destination_directory"; then
    echo "Error: Cannot create destination directory '$destination_directory'." >&2
    return 1
  fi
  if ! work_dir=$(mktemp -d); then
    echo "Error: Cannot create a temporary working directory." >&2
    return 1
  fi
  # Remove the scratch directory even if the run is interrupted.
  trap 'cleanup_temp_files > /dev/null' EXIT

  if ! true > "$final_file" || ! true > "$unsupported_file"; then
    echo "Error: Cannot write output files in '$destination_directory'." >&2
    cleanup_temp_files > /dev/null
    trap - EXIT
    return 1
  fi

  while IFS= read -r -d '' file; do
    # Never feed the script's own outputs or scratch files back into itself
    # (possible when the destination lies inside the search directory).
    if [[ "$file" -ef "$final_file" || "$file" -ef "$unsupported_file" ||
      "$file" == "$work_dir"/* ]]; then
      continue
    fi
    text_file_path=$(mktemp "$work_dir/text.XXXXXX") || continue
    # stdin is detached so no converter can swallow the NUL-delimited list.
    if process_file "$file" "$text_file_path" < /dev/null; then
      concatenate_files "$text_file_path" "$file"
    fi
    rm -f -- "$text_file_path"
  done < <(find "$search_directory" -type f -print0)

  cleanup_temp_files
  trap - EXIT

  echo "Process completed. All files have been processed and concatenated into ${final_file}."
  echo "Unsupported file types have been processed using 'cat' command and concatenated into ${unsupported_file}."
  return 0
}

main "$@"