cat files contents

1. Paths to Files or Directories
2. .gitignore Files (optional)
3. Jinja2 Template File (optional)
Running the Script
Sample .gitignore
Sample Jinja2 Template

Paths to Files or Directories: The script takes one or more paths to files or directories as arguments. These paths are processed to read and display the contents of the files.
.gitignore Files (optional): If present in the provided directories, these files will be used to determine which files and directories should be ignored based on the rules defined in them.
Jinja2 Template File (optional): If specified with the --template-file option, this file will be used to format the output using Jinja2 templates.

Here is a detailed list of what you need:

#1. Paths to Files or Directories

You need to provide at least one path to a file or directory. These are specified as arguments to the script.

Example:

/path/to/directory1 /path/to/file1.txt

#2. .gitignore Files (optional)

If you want the script to respect .gitignore rules, you should have .gitignore files in the directories you provide. The script will read these files to ignore certain files and directories.

Example:

/path/to/directory1/.gitignore

#3. Jinja2 Template File (optional)

If you want to format the output using a Jinja2 template, you need to provide a template file and specify its path using the --template-file or -t option.

Example:

/path/to/template.jinja2

#Running the Script

To run the script with these inputs, you would use a command like:

python3 your_script.py /path/to/directory1 /path/to/directory2 -t /path/to/template.jinja2

#Sample .gitignore

Here is a sample .gitignore file that you might place in your directories:

# Ignore all .log files
*.log

# Ignore all files in the temp directory
temp/

# Ignore a specific file
ignore_this_file.txt

#Sample Jinja2 Template

Here is a sample Jinja2 template (template.jinja2) that you might use:

File: {{ path }}
Index: {{ index }}
---
{{ content }}
---

  
    
    
     #python script:
        
    
  
      

import os
from fnmatch import fnmatch
import click
from jinja2 import Environment, FileSystemLoader


def should_ignore(path, gitignore_rules):
    """Return True when *path* matches any of the given ignore rules.

    Only the last path component is compared, using ``fnmatch`` globbing.
    When *path* is an existing directory its name is also tried with a
    trailing ``/`` so that directory-only rules such as ``temp/`` match.
    """
    name = os.path.basename(path)
    return any(
        fnmatch(name, rule) or (os.path.isdir(path) and fnmatch(name + "/", rule))
        for rule in gitignore_rules
    )


def read_gitignore(path):
    """Load the ignore rules from ``<path>/.gitignore``.

    Returns the whitespace-stripped lines of the file, leaving out blank
    lines and lines that begin with ``#``. An empty list is returned when
    the directory has no ``.gitignore`` file.
    """
    candidate = os.path.join(path, ".gitignore")
    if not os.path.isfile(candidate):
        return []

    rules = []
    with open(candidate, "r") as handle:
        for raw_line in handle:
            rule = raw_line.strip()
            # The comment test is made on the raw (unstripped) line.
            if not rule or raw_line.startswith("#"):
                continue
            rules.append(rule)
    return rules


def print_from_template(template_file, path, content, index):
    """Render one file through the Jinja2 template and echo the result.

    The template is loaded by file name from its own directory and is given
    three variables: ``content``, ``path`` and ``index``.
    """
    template_dir, template_name = os.path.split(template_file)
    loader = FileSystemLoader(template_dir)
    template = Environment(loader=loader).get_template(template_name)
    context = {"content": content, "path": path, "index": index}
    click.echo(template.render(**context))


def _emit_file(file_path, template_file, index):
    """Echo a single file and return the index to use for the next file.

    The file is read as text. When it cannot be decoded, a red warning is
    written to stderr and *index* is returned unchanged. Otherwise the
    content is rendered through *template_file* when one is given, or
    echoed in the default ``path / --- / content / ---`` layout, and
    ``index + 1`` is returned.
    """
    # Only the read can legitimately fail with a decode error, so keep the
    # try block (and the open file handle) limited to it.
    try:
        with open(file_path, "r") as f:
            content = f.read()
    except UnicodeDecodeError:
        warning_message = (
            f"Warning: Skipping file {file_path} due to UnicodeDecodeError"
        )
        click.echo(click.style(warning_message, fg="red"), err=True)
        return index

    if template_file:
        print_from_template(template_file, file_path, content, index)
    else:
        click.echo(file_path)  # Default output: just the path
        click.echo("---")
        click.echo(content)
        click.echo("---")
    return index + 1


def process_path(
    path,
    include_hidden,
    ignore_gitignore,
    gitignore_rules,
    ignore_patterns,
    template_file,
    index,
):
    """Echo the file at *path*, or every file below it when it is a directory.

    Args:
        path: File or directory to process.
        include_hidden: When false, names starting with ``.`` are skipped
            while walking a directory.
        ignore_gitignore: When false, the ``.gitignore`` of every visited
            directory is read and its rules are appended to
            *gitignore_rules*.
        gitignore_rules: List of ignore rules. It is extended in place, so
            rules picked up here remain in the caller's list.
        ignore_patterns: Extra ``fnmatch`` patterns applied to file names
            found while walking a directory.
        template_file: Optional Jinja2 template path used for the output.
        index: Running 1-based counter passed to the template.

    Returns:
        The counter value to use for the next file.
    """
    if os.path.isfile(path):
        return _emit_file(path, template_file, index)
    if not os.path.isdir(path):
        return index

    for root, dirs, files in os.walk(path):
        if not include_hidden:
            # Assigning to dirs[:] prunes the walk in place.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            files = [f for f in files if not f.startswith(".")]
        if not ignore_gitignore:
            gitignore_rules.extend(read_gitignore(root))
        dirs[:] = [
            d
            for d in dirs
            if not should_ignore(os.path.join(root, d), gitignore_rules)
        ]
        files = [
            f
            for f in files
            if not should_ignore(os.path.join(root, f), gitignore_rules)
        ]
        if ignore_patterns:
            files = [
                f
                for f in files
                if not any(fnmatch(f, pattern) for pattern in ignore_patterns)
            ]
        for file in sorted(files):
            index = _emit_file(os.path.join(root, file), template_file, index)
    return index


@click.command()
@click.argument("paths", nargs=-1, type=click.Path(exists=True))
@click.option(
    "--include-hidden",
    is_flag=True,
    help="Include files and folders starting with .",
)
@click.option(
    "--ignore-gitignore",
    is_flag=True,
    help="Ignore .gitignore files and include all files",
)
@click.option(
    "ignore_patterns",
    "--ignore",
    multiple=True,
    default=[],
    help="List of patterns to ignore",
)
@click.option(
    "--template-file",
    "-t",
    type=click.Path(exists=True),
    help="Path to a Jinja2 template file for formatting context items.",
)
@click.version_option()
def cli(paths, include_hidden, ignore_gitignore, ignore_patterns, template_file):
    """
    Takes one or more paths to files or directories and outputs every file,
    recursively, each one preceded with its filename like this:
    path/to/file.py
    ---
    Contents of file.py goes here
    ---
    path/to/file2.py
    ---
    ...
    """
    # Existence of every path is already enforced by click.Path(exists=True)
    # on the argument, so no separate check is needed here.
    gitignore_rules = []
    if not ignore_gitignore:
        # Seed the rules from the .gitignore next to each given path before
        # any path is processed.
        for path in paths:
            gitignore_rules.extend(read_gitignore(os.path.dirname(path)))

    index = 1  # 1-based running counter handed to the template
    for path in paths:
        index = process_path(
            path,
            include_hidden,
            ignore_gitignore,
            gitignore_rules,
            ignore_patterns,
            template_file,
            index,
        )


# Allow running the file directly (python3 your_script.py PATH ...).
if __name__ == "__main__":
    cli()

  
    
    
     #bash script:
        
    
  
      


#!/bin/bash

# Print the usage text (script name, arguments and default directories)
# to standard output.
display_help() {
    cat <<EOF
Usage: $0 [search_directory] [destination_directory]
Extracts text from files in the specified search directory, concatenates them,
and saves the result in the specified destination directory.
If no arguments are provided, default directories are used:
  - Search Directory: \$HOME/Documents
  - Destination Directory: \$HOME/Desktop
EOF
}

# Show the usage text and stop when the first argument asks for help.
case "$1" in
    -h|--help)
        display_help
        exit 0
        ;;
esac

# Working directories: the first and second positional arguments, falling
# back to the defaults when an argument is missing or empty.
search_directory="$1"
if [[ -z "$search_directory" ]]; then
    search_directory="$HOME/Documents"
fi

destination_directory="$2"
if [[ -z "$destination_directory" ]]; then
    destination_directory="$HOME/Desktop"
fi

# Output and scratch files, all located in the destination directory.
final_file="$destination_directory/final_concatenated_file.txt"
unsupported_file="$destination_directory/concatenated_contents_unsupported.txt"
temp_file_list="$destination_directory/temp_file_list.txt"

# MIME types (as reported by `file --mime-type`) that are accepted as
# supported when the textual checks are applied.
supported_types=(
    "text/plain"
    "text/markdown"
    "text/x-log"
    "text/x-srt"
    "text/x-microdvd"
    "text/csv"
    "text/xml"
    "application/json"
    "application/mbox"
    "application/vnd.ms-word"
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    "application/vnd.oasis.opendocument.text"
    "application/vnd.oasis.opendocument.spreadsheet"
    "application/vnd.oasis.opendocument.presentation"
    "application/pdf"
    "application/rtf"
    "text/html"
    "application/x-python"
    "application/java-archive"
    "text/javascript"
    "text/x-c++src"
    "text/x-chdr"
    "application/x-zip-compressed"
    "application/x-rar-compressed"
    "application/x-tar"
    "application/gzip"
    "application/vnd.ms-powerpoint"
    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    "text/x-tex"
)

# Function to check if output is textual
#
# Arguments:
#   $1 - path of the file to inspect
# Returns:
#   0 when the file is considered textual, 1 otherwise.
#
# A file is textual when it contains no NUL bytes and the output of
# `strings` is at least half as long as the file, or, failing that, when
# its MIME type is listed in the global `supported_types` array.
is_output_textual() {
    local file="$1"
    local is_text=0
    local file_size stripped_size strings_output mime_type supported_type

    # A missing output file cannot be textual.
    [[ -f "$file" ]] || return 1

    # Detect NUL bytes by comparing the size with and without them.
    # (A bash string cannot hold a NUL, so `grep $'\0'` is really `grep ''`
    # and matches every non-empty file.)
    file_size=$(wc -c < "$file")
    stripped_size=$(tr -d '\000' < "$file" | wc -c)

    if (( stripped_size == file_size )); then
        # Extract printable strings and check that they make up a
        # significant part of the file.
        strings_output=$(strings "$file")
        if (( ${#strings_output} >= file_size / 2 )); then
            is_text=1
        fi
    fi

    if [[ $is_text -eq 0 ]]; then
        # Fall back to the MIME type reported by `file`.
        mime_type=$(file -b --mime-type "$file")
        for supported_type in "${supported_types[@]}"; do
            if [[ "${mime_type}" == "${supported_type}" ]]; then
                is_text=1
                break
            fi
        done
    fi

    if [[ $is_text -eq 0 ]]; then
        return 1
    else
        return 0
    fi
}

# Function to process individual file
#
# Arguments:
#   $1 - source file
#   $2 - path where the extracted text is written
# Returns:
#   0 when a supported file was converted and its output is textual (the
#     caller is then expected to concatenate $2);
#   1 in every other case, including unsupported extensions, whose content
#     is appended directly to $final_file or $unsupported_file here.
process_file() {
    local file="$1"
    local text_file_path="$2"
    # Keep all working variables local: this function is re-entered for
    # files found inside archives.
    local failed_checks strings_output file_size stripped_size
    local mime_type type supported_type target_file

    if [[ -f "$file" ]]; then
        case "${file##*.}" in
            pdf|docx|html|htm|srt|sub|log|tex|latex|doc|ppt|pptx|zip|rar|tar|gz|odt|ods|odp|rtf|csv|xml|json|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
                echo "Processing file: $file"
                # Convert file to text and check if conversion is successful
                if ! convert_to_text "$file" "$text_file_path"; then
                    echo "Error: Failed to convert $file" >&2
                    return 1
                fi
                ;;
            *)
                # Unsupported file types are processed using 'cat' command
                echo "Warning: Unsupported file type '${file##*.}'. Processing using 'cat' command." >&2
                cat "$file" > "$text_file_path" 2>/dev/null

                # Initialize a counter for failed checks
                failed_checks=0

                # Check for null bytes by comparing sizes with and without
                # them (`grep $'\0'` would match any non-empty file because
                # bash strings cannot contain NUL).
                file_size=$(wc -c < "$text_file_path")
                stripped_size=$(tr -d '\000' < "$text_file_path" | wc -c)
                if (( stripped_size != file_size )); then
                    failed_checks=$((failed_checks + 1))
                fi

                # Check the significance of extracted printable strings
                strings_output=$(strings "$text_file_path")
                if (( ${#strings_output} < file_size / 2 )); then
                    failed_checks=$((failed_checks + 1))
                fi

                # Check the file's MIME type
                mime_type=$(file -b --mime-type "$text_file_path")
                supported_type=0
                for type in "${supported_types[@]}"; do
                    if [[ "${mime_type}" == "${type}" ]]; then
                        supported_type=1
                        break
                    fi
                done
                if [[ $supported_type -eq 0 ]]; then
                    failed_checks=$((failed_checks + 1))
                fi

                # Determine the output file based on the number of failed
                # checks: only content failing all three goes to the
                # unsupported file.
                if [[ $failed_checks -gt 2 ]]; then
                    echo "Warning: Output of $file failed more than two checks. Appending to unsupported file." >&2
                    target_file="$unsupported_file"
                else
                    echo "Output of $file passed the checks. Appending to final file."
                    target_file="$final_file"
                fi
                {
                    echo "<!-- $file -->:"
                    echo "\`\`\`"
                    cat "$text_file_path"
                    echo "\`\`\`"
                } >> "$target_file"
                rm -f "$text_file_path"
                # Already appended above, so tell the caller not to
                # concatenate it again.
                return 1
                ;;
        esac

        if ! is_output_textual "$text_file_path"; then
            echo "Warning: Output of $file is not textual. Skipping." >&2
            rm -f "$text_file_path"
            return 1
        fi
    else
        echo "Warning: '$file' is not a regular file. Skipping." >&2
        return 1
    fi

    echo "File $file processed successfully"
    return 0
}

# Convert one file to plain text, choosing the tool from the file extension.
#
# Arguments:
#   $1 - source file
#   $2 - path of the text file to write
# Returns:
#   0 when the selected command succeeded, 1 (with a message on stderr)
#   when it failed.
convert_to_text() {
    local file="$1"
    local text_file_path="$2"
    local extension="${file##*.}"
    local rc=0

    case "$extension" in
        pdf)
            echo "Converting $file using pdftotext..."
            pdftotext "$file" "$text_file_path" 2>/dev/null || rc=$?
            ;;
        docx|odt|ods|odp)
            echo "Converting $file using pandoc..."
            pandoc "$file" -t plain -o "$text_file_path" 2>/dev/null || rc=$?
            ;;
        html|htm)
            echo "Converting $file using lynx..."
            lynx -dump -nolist -assume_charset UTF-8 -display_charset UTF-8 "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        srt|sub|log|tex|latex|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
            # Already plain text: copied verbatim.
            echo "Copying $file..."
            cat "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        doc)
            echo "Converting $file using antiword..."
            antiword "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        ppt|pptx)
            echo "Converting $file using catppt..."
            catppt "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        zip|rar|tar|gz)
            echo "Extracting and processing $file..."
            extract_and_process_archive "$file" "$text_file_path" || rc=$?
            ;;
        rtf)
            echo "Converting $file using unrtf..."
            unrtf --text "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        csv)
            echo "Processing $file using awk..."
            awk -F, '{print}' "$file" > "$text_file_path" || rc=$?
            ;;
        xml)
            echo "Processing $file using xmllint..."
            xmllint --xpath "//text()" "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
        json)
            echo "Converting $file using jq..."
            jq -r '.' "$file" > "$text_file_path" 2>/dev/null || rc=$?
            ;;
    esac

    if (( rc != 0 )); then
        echo "Error: Failed to convert $file" >&2
        return 1
    fi

    echo "Conversion of $file completed successfully"
    return 0
}

# Function to extract and process archive files
#
# Arguments:
#   $1 - archive file (zip, rar, tar or gz, chosen by extension)
#   $2 - path of the text file that receives the combined output
# Returns:
#   0 after the archive was extracted and its members were processed,
#   1 when a temporary location could not be created or extraction failed.
#
# Every member is processed into its own scratch file and then appended to
# $2 under a marker line, so the text of all members is kept instead of
# each member overwriting the previous one.
extract_and_process_archive() {
    local file="$1"
    local text_file_path="$2"
    local temp_dir nested_output nested_file

    temp_dir=$(mktemp -d) || {
        echo "Error: Failed to create temporary directory for '$file'" >&2
        return 1
    }

    case "${file##*.}" in
        zip)
            echo "Extracting $file using unzip..."
            unzip -q "$file" -d "$temp_dir"
            ;;
        rar)
            echo "Extracting $file using unrar..."
            unrar x "$file" "$temp_dir"
            ;;
        tar)
            echo "Extracting $file using tar..."
            tar -xf "$file" -C "$temp_dir"
            ;;
        gz)
            echo "Extracting $file using gunzip..."
            gunzip -c "$file" > "$temp_dir/$(basename "$file" .gz)"
            ;;
    esac

    # $? is the status of the extraction command run in the matched branch.
    if [[ $? -ne 0 ]]; then
        echo "Error: Failed to extract archive '$file'" >&2
        rm -rf "$temp_dir"
        return 1
    fi

    echo "Extraction of $file completed successfully"

    # Scratch file for a single member; kept outside $temp_dir so the
    # find below does not list it.
    nested_output=$(mktemp) || {
        echo "Error: Failed to create temporary file for '$file'" >&2
        rm -rf "$temp_dir"
        return 1
    }

    # Start from an empty combined output.
    : > "$text_file_path"

    while IFS= read -r -d '' nested_file; do
        echo "Processing nested file: $nested_file"
        if process_file "$nested_file" "$nested_output"; then
            {
                # Marker: archive path plus the member path inside it.
                echo "<!-- ${file}:${nested_file#"$temp_dir"/} -->"
                cat "$nested_output"
            } >> "$text_file_path"
        fi
    done < <(find "$temp_dir" -type f -print0)

    rm -f "$nested_output"
    rm -rf "$temp_dir"
    return 0
}

# Function to concatenate files
#
# Arguments:
#   $1 - text file whose content is appended
#   $2 - original file name, written into the header comment
# Returns:
#   0 on success, 1 when reading $1 or writing to $final_file failed.
#
# The content is appended to the global $final_file wrapped in a header
# line and a fenced block.
concatenate_files() {
    local text_file_path="$1"
    local file="$2"
    local rc=0

    # Record a failure of any step; checking $? afterwards would only see
    # the status of the final echo. The closing fence is still written when
    # cat fails so the fences stay balanced.
    {
        echo "<!-- $file -->:" || rc=1
        echo "\`\`\`" || rc=1
        cat "$text_file_path" || rc=1
        echo "\`\`\`" || rc=1
    } >> "$final_file" || rc=1

    if [[ $rc -ne 0 ]]; then
        echo "Error: Failed to concatenate $text_file_path" >&2
        return 1
    fi

    echo "Concatenation of $text_file_path completed successfully"
    return 0
}

# Function to cleanup temporary files
#
# Removes the per-file text outputs from $destination_directory. Those
# files are named after the sanitized source path, which always starts
# with the sanitized $search_directory, so only top-level files carrying
# that prefix are deleted. Matching every *.txt recursively would also
# erase unrelated user files in the destination (by default the Desktop).
# The two result files are always kept.
cleanup_temp_files() {
    local temp_prefix
    # Same substitution that is used to build the temporary file names.
    temp_prefix=$(printf '%s\n' "$search_directory" | sed 's/[^a-zA-Z0-9]/_/g')

    find "$destination_directory" -maxdepth 1 -type f -name "${temp_prefix}*.txt" \
        ! -name "$(basename "$final_file")" \
        ! -name "$(basename "$unsupported_file")" -delete
    echo "Cleanup of temporary files completed successfully"
}

# Main script execution starts here
mkdir -p "$search_directory" "$destination_directory"

# Start with empty result files.
> "$final_file"
> "$unsupported_file"

# Snapshot the list of files before any output is produced. The list is
# NUL-delimited so that unusual file names are read back intact.
find "$search_directory" -type f -print0 > "$temp_file_list"

# The list is read on file descriptor 3 so that commands run inside the
# loop cannot consume it through their standard input.
while IFS= read -r -d '' file <&3; do
    # Skip the script's own files when the destination directory lies
    # inside the search directory.
    if [[ "$file" -ef "$final_file" || "$file" -ef "$unsupported_file" || "$file" -ef "$temp_file_list" ]]; then
        continue
    fi

    # Temporary output name: the source path with every non-alphanumeric
    # character replaced by an underscore.
    text_file_name=$(echo "$file" | sed 's/[^a-zA-Z0-9]/_/g')
    text_file_path="${destination_directory}/${text_file_name}.txt"
    if process_file "$file" "$text_file_path"; then
        concatenate_files "$text_file_path" "$file"
    fi
done 3< "$temp_file_list"

cleanup_temp_files
rm -f "$temp_file_list"

echo "Process completed. All files have been processed and concatenated into ${final_file}."
echo "Unsupported file types have been processed using 'cat' command and concatenated into ${unsupported_file}."

URL: https://ib.bsb.br/cat-files