pagenotes splitter: csv2files

Slug: csv2files

3638 characters 392 words

# Splits a pagenotes.app .csv export into separate .txt files

#!/usr/bin/env bash
set -euo pipefail
#
# split_pagenotes_csv.sh
#
# Input: CSV with 4 quoted fields per record:
#   "<url>","<timestamp>","<note text possibly with newlines>","<flag>"
#
# Output: one .txt file per record in the output directory
# (default: /home/rangelma/bash-csv/; override with a 2nd argument).
#   - filename derived from field 1 (URL) to match the examples:
#       * strip http:// or https://
#       * split on '/' and join with '>'
#       * drop empty trailing segments (so a trailing '/' doesn't create
#         a trailing '>')
#       * sanitize unsafe characters to '_'
#       * append ".txt"
#   - file content is exactly field 3 (preserving internal newlines)

usage() {
  cat <<'USAGE'
Usage: split_pagenotes_csv.sh /path/to/notes.csv [output-dir]

Default output directory: /home/rangelma/bash-csv/
USAGE
}

if [[ ${1:-} == "-h" || ${1:-} == "--help" ]]; then
  usage
  exit 0
fi

# Generalized: accept an optional 2nd argument overriding the output dir.
# One-argument invocations behave exactly as before.
if [[ $# -lt 1 || $# -gt 2 ]]; then
  echo "Error: expected the .csv file and an optional output directory." >&2
  usage >&2
  exit 2
fi

infile=$1
outdir=${2:-/home/rangelma/bash-csv}

if [[ ! -f "$infile" ]]; then
  echo "Error: input file not found: $infile" >&2
  exit 2
fi

mkdir -p "$outdir"

python3 - "$infile" "$outdir" <<'PY'
import csv
import os
import re
import sys
from urllib.parse import urlsplit

infile, outdir = sys.argv[1], sys.argv[2]
os.makedirs(outdir, exist_ok=True)


def url_to_filename(url: str) -> str:
    """Map a URL to a filesystem-safe filename ending in '.txt'."""
    url = url.strip()
    # Remove the scheme so 'https://a/b' and 'a/b' map to the same name.
    try:
        parts = urlsplit(url)
        if parts.scheme:
            # NOTE(review): query/fragment are dropped on this branch but
            # kept by the regex fallback below — assumed intentional to
            # match the filename examples; confirm against real exports.
            rest = parts.netloc + parts.path
        else:
            rest = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://', '', url)
    except Exception:
        rest = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://', '', url)
    # Convert path separators (either flavor) to '>'.
    rest = rest.replace("\\", "/")
    rest = rest.replace("/", ">")
    # Drop empty leading/trailing segments (e.g. a trailing '/').
    rest = rest.strip(">")
    # Sanitize: keep letters/digits/._->- ; replace everything else with '_'.
    rest = re.sub(r"[^A-Za-z0-9.\-_>]", "_", rest)
    if not rest:
        rest = "untitled"
    return rest + ".txt"


with open(infile, newline="", encoding="utf-8") as f:
    reader = csv.reader(
        f,
        delimiter=",",
        quotechar='"',
        doublequote=True,
        strict=False,
    )
    for row_idx, row in enumerate(reader, start=1):
        # Expecting 4 fields; tolerate extra fields by using the first 3.
        if not row or len(row) < 3:
            continue
        url = row[0]
        note_text = row[2]

        filename = url_to_filename(url)
        path = os.path.join(outdir, filename)

        # Avoid overwriting if multiple records map to the same filename.
        base, ext = os.path.splitext(path)
        final_path = path
        n = 1
        while os.path.exists(final_path):
            final_path = f"{base}__{n}{ext}"
            n += 1

        with open(final_path, "w", encoding="utf-8") as out:
            out.write(note_text)
            # Ensure the file ends with exactly one trailing newline.
            if note_text and not note_text.endswith("\n"):
                out.write("\n")
PY

echo "Done. Wrote files to: $outdir"
URL: https://ib.bsb.br/csv2files