#!/usr/bin/env bash
set -euo pipefail
# split_pagenotes_csv.sh
#
# Input: CSV with 4 quoted fields per record:
# "<url>","<timestamp>","<note text possibly with newlines>","<flag>"
#
# Output: one .txt per record in /home/rangelma/bash-csv/
# - filename derived from field 1 (URL) to match your examples:
# * strip http:// or https://
# * split on '/' and join with '>'
# * drop empty trailing segments (so trailing '/' doesn't create a trailing '>')
# * sanitize unsafe characters to '_'
# * append ".txt"
# - file content is field 3 (internal newlines preserved; a trailing newline is appended if the note lacks one)
# Print the help text (invocation syntax and the fixed output directory) to
# stdout. Callers that want it on stderr redirect the call themselves.
usage() {
  printf '%s\n' \
    'Usage:' \
    'split_pagenotes_csv.sh /path/to/notes.csv' \
    'Writes files to:' \
    '/home/rangelma/bash-csv/'
}
# A help flag in the first position wins over every other check.
case "${1:-}" in
  -h | --help)
    usage
    exit 0
    ;;
esac

# Exactly one positional argument is accepted: the CSV to split.
if (( $# != 1 )); then
  printf '%s\n' "Error: expected exactly 1 argument (the .csv file)." >&2
  usage >&2
  exit 2
fi

infile=$1
outdir="/home/rangelma/bash-csv"

# The input has to be an existing regular file.
if ! [[ -f "$infile" ]]; then
  printf '%s\n' "Error: input file not found: $infile" >&2
  exit 2
fi

# Create the output directory (and any missing parents) up front.
mkdir -p "$outdir"
#######################################
# Split a pagenotes CSV into one text file per record.
# The parsing is delegated to an embedded Python 3 program because note
# fields may contain commas, doubled quotes, and newlines.
# Arguments:
#   $1 - path to the input CSV ("<url>","<timestamp>","<note>","<flag>")
#   $2 - output directory (created if missing)
# Outputs:
#   One .txt file per usable record in $2; warnings about skipped records
#   are written to stderr.
# Returns:
#   The exit status of the Python program.
#######################################
split_csv_records() {
  python3 - "$1" "$2" <<'PY'
import csv
import os
import re
import sys
from urllib.parse import urlsplit

infile, outdir = sys.argv[1], sys.argv[2]
os.makedirs(outdir, exist_ok=True)

# Matches a leading "<scheme>://" prefix such as "http://" or "https://".
SCHEME_PREFIX = re.compile(r'^[A-Za-z][A-Za-z0-9+.-]*://')


def url_to_filename(url: str) -> str:
    """Turn a URL into a flat ".txt" filename.

    The scheme is removed, '/' (and '\\') separators become '>', leading and
    trailing empty segments are dropped, and every character other than
    letters, digits, '.', '-', '_' and '>' is replaced with '_'.
    An empty result falls back to "untitled".
    """
    url = url.strip()
    try:
        parts = urlsplit(url)
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); fall back to plain prefix stripping.
        parts = None

    if parts is not None and parts.scheme:
        # Only host and path are used; query string and fragment are dropped.
        rest = parts.netloc + parts.path
    else:
        rest = SCHEME_PREFIX.sub('', url)

    # Convert path separators to '>'
    rest = rest.replace("\\", "/").replace("/", ">")
    # Drop empty leading/trailing segments (e.g. from a trailing '/')
    rest = rest.strip(">")
    # Sanitize: anything outside the allowed set becomes '_'
    rest = re.sub(r"[^A-Za-z0-9.\-_>]", "_", rest)
    if not rest:
        rest = "untitled"
    return rest + ".txt"


# "utf-8-sig" transparently removes a UTF-8 byte-order mark. With plain
# "utf-8" the BOM would be glued to the first field, the opening quote would
# no longer be at the start of the field, and the first record's filename
# would be mangled. Files without a BOM are read exactly as before.
with open(infile, newline="", encoding="utf-8-sig") as f:
    reader = csv.reader(
        f,
        delimiter=",",
        quotechar='"',
        doublequote=True,
        strict=False,
    )
    for row_idx, row in enumerate(reader, start=1):
        if not row:
            # Blank line between records: nothing to report.
            continue
        if len(row) < 3:
            # The note text is field 3; without it there is nothing to write.
            # Report the record instead of dropping it silently.
            print(
                f"Warning: skipping record {row_idx}: "
                f"expected at least 3 fields, got {len(row)}",
                file=sys.stderr,
            )
            continue

        url = row[0]
        note_text = row[2]
        path = os.path.join(outdir, url_to_filename(url))

        # Avoid overwriting if multiple records map to the same filename:
        # append __1, __2, ... before the extension until the name is free.
        base, ext = os.path.splitext(path)
        final_path = path
        n = 1
        while os.path.exists(final_path):
            final_path = f"{base}__{n}{ext}"
            n += 1

        with open(final_path, "w", encoding="utf-8") as out:
            out.write(note_text)
            # Non-empty notes always end with a newline in the output file.
            if note_text and not note_text.endswith("\n"):
                out.write("\n")
PY
}

split_csv_records "$infile" "$outdir"
echo "Done. Wrote files to: $outdir"
# URL: https://ib.bsb.br/csv2files