It accepts a local file, a URL, or stdin; it can emit Jekyll front matter; and it uses conservative conversion rules that tend to render well on GitHub Pages (kramdown).
# Key behaviors

- Default extraction is `--mode main` (tries `<main>`, `<article>`, common content containers, then a “largest text block” heuristic).
- Use `--selector` when heuristics pick the wrong container.
- Tables become Markdown tables only when they’re simple; otherwise they become bullets.
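As a quick illustration of the table rule, here is a minimal sketch that calls the converter's public function from the script below; the import path assumes you saved it as `html_to_jekyll_md.py` on `PYTHONPATH`:

```python
# Sketch, not part of the script: simple tables survive as pipe tables.
from html_to_jekyll_md import convert_html_to_markdown

simple = "<table><tr><th>Field</th><th>Value</th></tr><tr><td>CPU</td><td>4 cores</td></tr></table>"
print(convert_html_to_markdown(simple, profile="generic"))
# → | Field | Value |
#   |---|---|
#   | CPU | 4 cores |
# Nested tables or multi-line cells come back as a bullet list instead.
```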
# Install and run

```bash
python3 -m pip install --user beautifulsoup4 lxml
```
# Generic HTML file → Jekyll post

```bash
python3 html_to_jekyll_md.py page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta --tags imported,html
```
# Generic URL → Jekyll page

```bash
python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/ --date-from-meta
```
# When heuristics pick the wrong content container, force it with a selector

```bash
python3 html_to_jekyll_md.py page.html --selector "article" -o docs/page.md --layout page
```
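The same conversion is available programmatically; a minimal sketch, again assuming the script is importable as `html_to_jekyll_md`:

```python
from pathlib import Path
from html_to_jekyll_md import convert_html_to_markdown  # assumed import path

html = Path("page.html").read_text(encoding="utf-8", errors="replace")
# selector overrides the container heuristics, exactly like --selector on the CLI
md = convert_html_to_markdown(html, selector="article", keep_images=True)
Path("docs/page.md").write_text(md, encoding="utf-8")
```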
# Practical limits

- If the page is JavaScript-rendered (SPA shells), you need a “rendered HTML snapshot” first (browser “Save page as…”, or a headless browser step; see the sketch after this list). This script converts the HTML it is given; it does not run scripts.
- Highly visual layouts won’t “round-trip” cleanly. The goal here is readable Markdown, not pixel-perfect reconstruction.
- Tables are conservative by design: broken tables are worse than bullet lists on GitHub Pages.
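For the SPA case above, one workable snapshot step is a headless browser dump. A sketch assuming the third-party Playwright package (any tool that can save the rendered DOM works just as well):

```python
# Render a JavaScript-heavy page to static HTML before converting it.
# Assumes: pip install playwright && python3 -m playwright install chromium
from playwright.sync_api import sync_playwright

with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com/spa-article", wait_until="networkidle")
    html = page.content()  # the fully rendered DOM
    browser.close()

with open("snapshot.html", "w", encoding="utf-8") as f:
    f.write(html)
# Then convert the snapshot like any static page:
#   python3 html_to_jekyll_md.py snapshot.html -o docs/article.md --layout page
```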
# Optional batch wrapper (bash)

```bash
#!/usr/bin/env bash
set -euo pipefail
in="${1:?input (file or URL or '-') required}"
out="${2:-docs/$(basename "${in%.*}").md}"
python3 ./html_to_jekyll_md.py "$in" -o "$out" --layout page --date-from-meta
echo "Wrote: $out"
# Python script

```python
#!/usr/bin/env python3
"""html_to_jekyll_md.py
Universal HTML → Markdown converter with Jekyll front matter.
Goals
- Accept *any* single-page HTML source: local file, URL (http/https), or stdin.
- Emit readable Markdown that works well on GitHub Pages (Jekyll + kramdown).
- Provide an optional HardInfo2 profile (because HardInfo2 uses layout tables).
Install
python3 -m pip install beautifulsoup4
(optional, faster/more robust parsing) python3 -m pip install lxml
Examples
# Local file → Jekyll post
python3 html_to_jekyll_md.py ./page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta
# URL → Jekyll page
python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/
# stdin → stdout
cat page.html | python3 html_to_jekyll_md.py -
# HardInfo2 report (auto-detect, or force)
python3 html_to_jekyll_md.py hardinfo2_report.html --profile hardinfo2 -o docs/hardinfo2.md
Notes
- This converts the HTML it receives; it does not execute JavaScript.
- “Perfect” HTML→Markdown is not generally achievable. This aims for a clean, readable document.
"""
from __future__ import annotations
import argparse
import re
import sys
import urllib.parse
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterable, List, Optional, Sequence, Tuple
# -----------------------------
# Parsing helpers
# -----------------------------
_WS_RE = re.compile(r"[ \t\r\f\v]+")
def _require_bs4() -> None:
try:
import bs4 # noqa: F401
except Exception:
print(
"ERROR: Missing dependency: beautifulsoup4\n"
" Install with: python3 -m pip install beautifulsoup4",
file=sys.stderr,
)
raise
def _make_soup(html: str):
from bs4 import BeautifulSoup
try:
return BeautifulSoup(html, "lxml")
except Exception:
return BeautifulSoup(html, "html.parser")
def _norm_ws_keep_newlines(s: str) -> str:
s = s.replace("\u00a0", " ")
s = s.replace("\r\n", "\n").replace("\r", "\n")
lines = [_WS_RE.sub(" ", ln).strip() for ln in s.split("\n")]
out = "\n".join(lines)
out = re.sub(r"\n{3,}", "\n\n", out)
return out.strip()
def _yaml_quote(s: str) -> str:
return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
def _escape_md_inline(s: str) -> str:
# Minimal escaping; keep it conservative.
return s.replace("\\", "\\\\").replace("`", "\\`")
def _escape_table_cell(s: str) -> str:
s = s.replace("\\", "\\\\")
s = s.replace("|", "\\|")
s = s.replace("\n", "<br />")
return s.strip()
def _collapse_blank_lines(lines: List[str]) -> List[str]:
out: List[str] = []
blank = False
for ln in lines:
if not ln.strip():
if not blank:
out.append("")
blank = True
else:
out.append(ln.rstrip())
blank = False
while out and out[0] == "":
out.pop(0)
while out and out[-1] == "":
out.pop()
return out
def _indent_lines(lines: Iterable[str], spaces: int) -> List[str]:
pad = " " * spaces
return [(pad + ln) if ln else "" for ln in lines]
def _inline_or_codeblock(text: str) -> Tuple[str, Optional[List[str]]]:
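    # Render a (possibly multi-line) value either inline or as a code block:
    # up to three short lines are joined with " ; "; anything longer becomes
    # a fenced text block so kramdown cannot mangle it.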
text = _norm_ws_keep_newlines(text)
if not text:
return "", None
lines = [ln.rstrip() for ln in text.splitlines() if ln.strip()]
if len(lines) <= 1:
return (lines[0] if lines else ""), None
total = sum(len(ln) for ln in lines)
if len(lines) <= 3 and total <= 140:
return " ; ".join(lines), None
return "", ["```text", *lines, "```"]
# -----------------------------
# Input loading
# -----------------------------
def _read_input(input_arg: str, user_agent: str = "html_to_jekyll_md/1.1") -> Tuple[str, Optional[str]]:
"""Return (html, base_url).
base_url is used to resolve relative links/images.
- stdin: None
- file: None
- URL: the URL
"""
if input_arg == "-":
return sys.stdin.read(), None
if re.match(r"^https?://", input_arg, flags=re.IGNORECASE):
req = urllib.request.Request(input_arg, headers={"User-Agent": user_agent})
with urllib.request.urlopen(req) as resp:
charset = resp.headers.get_content_charset() or "utf-8"
data = resp.read()
try:
html = data.decode(charset, errors="replace")
except Exception:
html = data.decode("utf-8", errors="replace")
return html, input_arg
path = Path(input_arg)
return path.read_text(encoding="utf-8", errors="replace"), None
def _extract_base_href(soup) -> Optional[str]:
base = soup.find("base")
if base and base.get("href"):
return _norm_ws_keep_newlines(base["href"]) or None
return None
# -----------------------------
# Jekyll front matter
# -----------------------------
@dataclass
class FrontMatter:
layout: str = "post"
title: str = "Document"
date: Optional[str] = None
categories: Optional[List[str]] = None
tags: Optional[List[str]] = None
permalink: Optional[str] = None
def render(self) -> str:
lines = ["---", f"layout: {self.layout}", f"title: {_yaml_quote(self.title)}"]
if self.date:
lines.append(f"date: {_yaml_quote(self.date)}")
if self.categories:
lines.append("categories:")
for c in self.categories:
lines.append(f" - {c}")
if self.tags:
lines.append("tags:")
for t in self.tags:
lines.append(f" - {t}")
if self.permalink:
lines.append(f"permalink: {self.permalink}")
lines.append("---")
return "\n".join(lines) + "\n\n"
def _extract_title(soup, override: Optional[str] = None) -> str:
if override:
return override
t = soup.find("title")
if t and t.get_text(strip=True):
return _norm_ws_keep_newlines(t.get_text())
h1 = soup.find("h1")
if h1 and h1.get_text(strip=True):
return _norm_ws_keep_newlines(h1.get_text())
return "Document"
def _extract_date_hint(soup) -> Optional[str]:
meta_props = [
("property", "article:published_time"),
("name", "date"),
("name", "publish-date"),
("name", "pubdate"),
("itemprop", "datePublished"),
]
for attr, key in meta_props:
m = soup.find("meta", attrs={attr: key})
if m and m.get("content"):
return _norm_ws_keep_newlines(m["content"])
tm = soup.find("time")
if tm and tm.get("datetime"):
return _norm_ws_keep_newlines(tm["datetime"])
return None
def _now_with_tz() -> str:
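    # Local time with UTC offset, e.g. "2026-01-09 14:30:00 +0100", a format
    # Jekyll accepts for post dates.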
return datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")
# -----------------------------
# Content selection
# -----------------------------
def _strip_non_content(soup) -> None:
# Keep it conservative; do not drop “head” here.
for sel in ["script", "style", "noscript", "template", "iframe"]:
for tag in soup.select(sel):
tag.decompose()
def _maybe_strip_chrome(root) -> None:
for sel in ["nav", "header", "footer", "aside"]:
for tag in root.find_all(sel):
tag.decompose()
def _text_len(tag) -> int:
return len(tag.get_text(" ", strip=True))
def _score_candidate(tag) -> int:
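    # Readability-style score: total text length minus a 0.7 penalty per
    # character of link text, so link-dense chrome (menus, sidebars, footers)
    # scores below genuine article bodies.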
text = tag.get_text(" ", strip=True)
if not text:
return 0
text_len = len(text)
link_text = " ".join(a.get_text(" ", strip=True) for a in tag.find_all("a"))
return max(0, text_len - int(0.7 * len(link_text)))
def _select_root(soup, mode: str, selector: Optional[str]):
body = soup.body or soup
if selector:
found = soup.select_one(selector)
return found or body
if mode == "body":
return body
# mode == "main"
for tagname in ["main", "article"]:
t = body.find(tagname)
if t and _text_len(t) > 50:
_maybe_strip_chrome(t)
return t
for css in ["#content", "#main", ".content", ".main", ".post", ".article", ".entry-content"]:
t = body.select_one(css)
if t and _text_len(t) > 80:
_maybe_strip_chrome(t)
return t
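    # Fallback: scan likely containers and keep the highest-scoring one.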
candidates = body.find_all(["div", "section", "article", "main"], limit=250)
best = body
best_score = _score_candidate(body)
for c in candidates:
sc = _score_candidate(c)
if sc > best_score:
best, best_score = c, sc
_maybe_strip_chrome(best)
return best
# -----------------------------
# HardInfo2 profile (optional)
# -----------------------------
def _looks_like_hardinfo2(soup) -> bool:
# HardInfo2 exports layout tables with these classes.
return bool(soup.find("td", class_="stitle") or soup.find("h1", class_="title"))
def _hardinfo2_text(tag) -> str:
# HardInfo2 uses <br> and <tt> blocks; preserve newlines.
return _norm_ws_keep_newlines(tag.get_text("\n", strip=True))
def _hardinfo2_iter_rows(table):
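    # Walk a HardInfo2 layout table, classifying each top-level <tr> as a
    # subsection title ("sstitle"), a nested details table, a field/value
    # pair ("kv"), or raw cells.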
tbody = table.find("tbody", recursive=False)
container = tbody if tbody else table
for tr in container.find_all("tr", recursive=False):
tds = tr.find_all("td", recursive=False)
if not tds:
continue
if any("stitle" in (td.get("class") or []) for td in tds):
continue
sst = tr.find("td", class_="sstitle")
if sst:
yield ("sstitle", _hardinfo2_text(sst))
continue
nested_tables = tr.find_all("table")
if nested_tables and len(tds) == 1 and tds[0].get("colspan"):
yield ("details", nested_tables)
continue
# Drop icon columns
data = []
for td in tds:
cls = td.get("class") or []
if "icon" in cls or "icon_subtitle" in cls:
continue
data.append(td)
if not data:
continue
if "field" in (data[0].get("class") or []):
field = _hardinfo2_text(data[0])
vals = [_hardinfo2_text(td) for td in data[1:]]
yield ("kv", (field, vals, tr.find_all("table") or None))
else:
yield ("raw", [_hardinfo2_text(td) for td in data])
def _hardinfo2_render_table(table, heading_level: int) -> List[str]:
lines: List[str] = []
st = table.find("td", class_="stitle")
title = _hardinfo2_text(st) if st else None
if title:
lines.append("#" * heading_level + f" {_escape_md_inline(title)}")
lines.append("")
def render_details(details_table, indent=2) -> List[str]:
out: List[str] = []
for kind, payload in _hardinfo2_iter_rows(details_table):
if kind == "sstitle":
out.append(" " * indent + f"- **{_escape_md_inline(payload)}**")
continue
if kind == "details":
for t in payload:
out.extend(render_details(t, indent=indent + 2))
continue
if kind == "kv":
field, vals, nested = payload
val_text = " | ".join(v for v in vals if v)
inline, code = _inline_or_codeblock(val_text)
if inline:
out.append(" " * indent + f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
else:
out.append(" " * indent + f"- **{_escape_md_inline(field)}:**")
out.extend(_indent_lines(code or [], indent + 4))
if nested:
for t in nested:
out.extend(render_details(t, indent=indent + 2))
continue
if kind == "raw":
txt = " | ".join(payload)
inline, code = _inline_or_codeblock(txt)
if inline:
out.append(" " * indent + f"- {_escape_md_inline(inline)}")
else:
out.append(" " * indent + "-")
out.extend(_indent_lines(code or [], indent + 4))
return out
for kind, payload in _hardinfo2_iter_rows(table):
if kind == "sstitle":
lines.append("#" * (heading_level + 1) + f" {_escape_md_inline(payload)}")
lines.append("")
continue
if kind == "details":
lines.append("- **Details:**")
for t in payload:
lines.extend(render_details(t, indent=2))
lines.append("")
continue
if kind == "kv":
field, vals, nested = payload
val_text = " | ".join(v for v in vals if v)
inline, code = _inline_or_codeblock(val_text)
if inline:
lines.append(f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
else:
lines.append(f"- **{_escape_md_inline(field)}:**")
lines.extend(_indent_lines(code or [], 4))
if nested:
for t in nested:
lines.extend(render_details(t, indent=2))
continue
if kind == "raw":
txt = " | ".join(payload)
inline, code = _inline_or_codeblock(txt)
if inline:
lines.append(f"- {_escape_md_inline(inline)}")
else:
lines.append("-")
lines.extend(_indent_lines(code or [], 4))
lines.append("")
return lines
def _hardinfo2_convert(html: str, section_heading_level: int = 2) -> str:
soup = _make_soup(html)
root = soup.body or soup
out: List[str] = []
for el in root.find_all(["h1", "table"], recursive=True):
if el.name == "h1" and "title" in (el.get("class") or []):
out.append("#" * section_heading_level + f" {_escape_md_inline(_hardinfo2_text(el))}")
out.append("")
elif el.name == "table" and el.find_parent("table") is None:
out.extend(_hardinfo2_render_table(el, heading_level=section_heading_level + 1))
return "\n".join(_collapse_blank_lines(out)) + "\n"
# -----------------------------
# Generic HTML → Markdown
# -----------------------------
BLOCK_TAGS = {
"p",
"div",
"section",
"article",
"main",
"pre",
"blockquote",
"ul",
"ol",
"table",
"hr",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
}
class _Ctx:
def __init__(
self,
base_url: Optional[str],
keep_images: bool,
heading_offset: int,
max_heading_level: int,
):
self.base_url = base_url
self.keep_images = keep_images
self.heading_offset = heading_offset
self.max_heading_level = max_heading_level
def _resolve_url(href: str, base_url: Optional[str]) -> str:
if not href:
return ""
return urllib.parse.urljoin(base_url, href) if base_url else href
def _extract_text(tag) -> str:
return _norm_ws_keep_newlines(tag.get_text(" ", strip=True))
def _convert_children(tag, ctx: _Ctx) -> List[str]:
from bs4 import NavigableString, Tag
out: List[str] = []
for child in tag.children:
if isinstance(child, NavigableString):
txt = _norm_ws_keep_newlines(str(child))
if txt:
out.append(txt)
continue
if isinstance(child, Tag):
out.extend(_convert_node(child, ctx))
return out
def _convert_children_inline(tag, ctx: _Ctx) -> List[str]:
from bs4 import NavigableString, Tag
out: List[str] = []
for child in tag.children:
if isinstance(child, NavigableString):
txt = _norm_ws_keep_newlines(str(child))
if txt:
out.append(txt)
continue
if isinstance(child, Tag):
out.append(_convert_inline(child, ctx))
return out
def _convert_inline(tag, ctx: _Ctx) -> str:
name = tag.name.lower()
if name in ("span", "label", "small", "sup", "sub"):
return "".join(_convert_children_inline(tag, ctx))
if name in ("strong", "b"):
inner = "".join(_convert_children_inline(tag, ctx)).strip()
return f"**{inner}**" if inner else ""
if name in ("em", "i"):
inner = "".join(_convert_children_inline(tag, ctx)).strip()
return f"*{inner}*" if inner else ""
if name == "code":
inner = _extract_text(tag)
if "`" in inner:
return f"``{inner}``"
return f"`{inner}`"
if name == "br":
return " \n"
if name == "a":
href = _resolve_url(tag.get("href", ""), ctx.base_url)
text = "".join(_convert_children_inline(tag, ctx)).strip() or href
return f"[{text}]({href})" if href else text
if name == "img":
if not ctx.keep_images:
return ""
alt = (tag.get("alt") or "image").strip() or "image"
src = _resolve_url(tag.get("src", ""), ctx.base_url)
return f"" if src else ""
return "".join(_convert_children_inline(tag, ctx))
def _table_to_bullets(table, ctx: _Ctx) -> List[str]:
lines: List[str] = []
tbody = table.find("tbody", recursive=False)
container = tbody if tbody else table
for tr in container.find_all("tr", recursive=False):
cells = tr.find_all(["th", "td"], recursive=False)
if not cells:
continue
vals = [_extract_text(td) for td in cells]
vals = [v for v in vals if v]
if not vals:
continue
txt = " | ".join(vals)
inline, code = _inline_or_codeblock(txt)
if inline:
lines.append(f"- {_escape_md_inline(inline)}")
else:
lines.append("-")
lines.extend(_indent_lines(code or [], 4))
lines.append("")
return lines
def _table_to_md(table, ctx: _Ctx) -> List[str]:
if table.find("table"):
return _table_to_bullets(table, ctx)
rows: List[List[str]] = []
tbody = table.find("tbody", recursive=False)
container = tbody if tbody else table
for tr in container.find_all("tr", recursive=False):
cells = tr.find_all(["th", "td"], recursive=False)
if not cells:
continue
rows.append([_extract_text(td) for td in cells])
if not rows:
return []
has_th = bool(table.find("th"))
header = rows[0] if has_th else None
body_rows = rows[1:] if has_th else rows
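    # Without <th> cells: promote a short first row to a header, otherwise
    # synthesize "Col N" names.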
if header is None:
first = rows[0]
if first and all(len(c) <= 40 for c in first) and len(first) <= 6:
header = first
body_rows = rows[1:]
else:
header = [f"Col {i+1}" for i in range(len(rows[0]))]
body_rows = rows
    # Wrap in a list so a header-only table (no body rows) does not crash max().
    col_count = max([len(header), *(len(r) for r in body_rows)])
def pad(r: List[str]) -> List[str]:
return (r + [""] * col_count)[:col_count]
header = pad(header)
lines: List[str] = []
lines.append("| " + " | ".join(_escape_table_cell(c) for c in header) + " |")
lines.append("|" + "|".join(["---"] * col_count) + "|")
for r in body_rows:
r = pad(r)
if any("\n" in c for c in r):
return _table_to_bullets(table, ctx)
lines.append("| " + " | ".join(_escape_table_cell(c) for c in r) + " |")
lines.append("")
return lines
def _list_to_md(list_tag, ordered: bool, ctx: _Ctx, indent: int = 0) -> List[str]:
from bs4 import Tag
lines: List[str] = []
items = [li for li in list_tag.find_all("li", recursive=False) if isinstance(li, Tag)]
for idx, li in enumerate(items, start=1):
prefix = f"{idx}. " if ordered else "- "
cont_indent = indent + len(prefix)
# Build “main line” from inline-ish children and the first paragraph-ish block.
main_fragments: List[str] = []
extra_blocks: List[List[str]] = []
for child in li.children:
if isinstance(child, Tag) and child.name.lower() in ("ul", "ol"):
continue
if isinstance(child, Tag) and child.name.lower() in BLOCK_TAGS:
block_lines = _convert_node(child, ctx)
block_lines = _collapse_blank_lines(block_lines)
if block_lines:
if not main_fragments:
# Use first non-empty line as the main line.
first_line = next((x for x in block_lines if x.strip()), "")
if first_line:
main_fragments.append(first_line)
rest = [x for x in block_lines if x.strip()][1:]
if rest:
extra_blocks.append(rest)
else:
extra_blocks.append([x for x in block_lines if x.strip()])
continue
if isinstance(child, Tag):
main_fragments.append(_convert_inline(child, ctx))
else:
txt = _norm_ws_keep_newlines(str(child))
if txt:
main_fragments.append(txt)
main_line = _norm_ws_keep_newlines("".join(main_fragments)).strip()
lines.append(" " * indent + prefix + main_line)
for block in extra_blocks:
for ln in block:
lines.append(" " * cont_indent + ln)
for child in li.find_all(["ul", "ol"], recursive=False):
lines.extend(_list_to_md(child, ordered=(child.name.lower() == "ol"), ctx=ctx, indent=indent + 2))
lines.append("")
return lines
def _convert_node(tag, ctx: _Ctx) -> List[str]:
name = tag.name.lower()
if name in ("script", "style", "noscript", "template"):
return []
if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
lvl = int(name[1]) + ctx.heading_offset
lvl = max(1, min(ctx.max_heading_level, lvl))
text = "".join(_convert_children_inline(tag, ctx)).strip()
text = _norm_ws_keep_newlines(text)
return ["#" * lvl + f" {_escape_md_inline(text)}", ""] if text else []
if name == "p":
text = "".join(_convert_children_inline(tag, ctx)).strip()
text = _norm_ws_keep_newlines(text)
return [text, ""] if text else []
if name == "hr":
return ["---", ""]
if name == "blockquote":
inner = _collapse_blank_lines(_convert_children(tag, ctx))
quoted: List[str] = []
for ln in inner:
quoted.append(">" if not ln.strip() else "> " + ln)
quoted.append("")
return quoted
if name == "pre":
# Try to infer a language from pre/code classes.
lang = ""
cls = " ".join(tag.get("class") or [])
m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls)
if not m:
code = tag.find("code")
if code:
cls2 = " ".join(code.get("class") or [])
m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls2)
if m:
lang = m.group(1)
code_text = tag.get_text("\n", strip=False).replace("\r\n", "\n").replace("\r", "\n")
lines = code_text.splitlines()
while lines and not lines[0].strip():
lines.pop(0)
while lines and not lines[-1].strip():
lines.pop()
return [f"```{lang}".rstrip(), *lines, "```", ""]
if name == "ul":
return _list_to_md(tag, ordered=False, ctx=ctx)
if name == "ol":
return _list_to_md(tag, ordered=True, ctx=ctx)
if name == "table":
return _table_to_md(tag, ctx)
if name == "img":
inline = _convert_inline(tag, ctx)
return [inline, ""] if inline else []
if name in ("div", "section", "article", "main", "body"):
return _convert_children(tag, ctx)
inline = _convert_inline(tag, ctx).strip()
return [inline] if inline else []
def _generic_convert(
html: str,
*,
base_url: Optional[str],
mode: str,
selector: Optional[str],
keep_images: bool,
heading_offset: int,
max_heading_level: int,
) -> str:
soup = _make_soup(html)
_strip_non_content(soup)
root = _select_root(soup, mode=mode, selector=selector)
ctx = _Ctx(
base_url=base_url,
keep_images=keep_images,
heading_offset=heading_offset,
max_heading_level=max_heading_level,
)
lines = _convert_children(root, ctx)
lines = _collapse_blank_lines(lines)
return "\n".join(lines) + "\n"
def convert_html_to_markdown(
html: str,
*,
base_url: Optional[str] = None,
mode: str = "main", # main|body
selector: Optional[str] = None,
keep_images: bool = False,
heading_offset: int = 0,
max_heading_level: int = 6,
profile: str = "auto", # auto|generic|hardinfo2
hardinfo_section_level: int = 2,
) -> str:
soup = _make_soup(html)
_strip_non_content(soup)
if profile == "hardinfo2" or (profile == "auto" and _looks_like_hardinfo2(soup)):
return _hardinfo2_convert(html, section_heading_level=max(1, hardinfo_section_level))
return _generic_convert(
html,
base_url=base_url,
mode=mode,
selector=selector,
keep_images=keep_images,
heading_offset=heading_offset,
max_heading_level=max(1, max_heading_level),
)
# -----------------------------
# CLI
# -----------------------------
def main(argv: Optional[Sequence[str]] = None) -> int:
_require_bs4()
p = argparse.ArgumentParser(description="Convert HTML (file/URL/stdin) to Jekyll-friendly Markdown")
p.add_argument("input", help="Path, URL, or '-' for stdin")
p.add_argument("-o", "--output", type=Path, default=None, help="Output .md file (default: stdout)")
# Front matter
p.add_argument("--no-front-matter", action="store_true", help="Do not emit YAML front matter")
p.add_argument("--layout", default="post", help="Front matter layout (default: post)")
p.add_argument("--title", default=None, help="Front matter title override")
p.add_argument("--date", default=None, help="Front matter date override")
p.add_argument("--date-from-meta", action="store_true", help="Try to extract date from meta/time tags")
p.add_argument("--categories", default=None, help="Comma-separated categories")
p.add_argument("--tags", default=None, help="Comma-separated tags")
p.add_argument("--permalink", default=None, help="Front matter permalink")
# Conversion knobs
p.add_argument("--mode", choices=["main", "body"], default="main", help="Content selection mode")
p.add_argument("--selector", default=None, help="CSS selector to choose content root (overrides --mode)")
p.add_argument("--base-url", default=None, help="Base URL for resolving relative links/images")
p.add_argument("--keep-images", action="store_true", help="Include <img> as Markdown images")
p.add_argument("--heading-offset", type=int, default=0, help="Add to heading levels (e.g., 1 makes h1→h2)")
p.add_argument("--max-heading-level", type=int, default=6, help="Clamp max heading depth (default: 6)")
# Profiles
p.add_argument(
"--profile",
choices=["auto", "generic", "hardinfo2"],
default="auto",
help="auto detects HardInfo2; otherwise force generic/hardinfo2",
)
p.add_argument(
"--hardinfo-section-level",
type=int,
default=2,
help="Heading level for HardInfo2 section titles (default: 2)",
)
args = p.parse_args(argv)
html, inferred_base = _read_input(args.input)
soup = _make_soup(html)
# Base URL: CLI > <base href> > inferred (URL input)
base_url = args.base_url or _extract_base_href(soup) or inferred_base
fm = None
if not args.no_front_matter:
title = _extract_title(soup, args.title)
date = args.date
if not date and args.date_from_meta:
date = _extract_date_hint(soup)
if not date:
date = _now_with_tz()
fm = FrontMatter(
layout=args.layout,
title=title,
date=date,
categories=[c.strip() for c in (args.categories or "").split(",") if c.strip()] or None,
tags=[t.strip() for t in (args.tags or "").split(",") if t.strip()] or None,
permalink=args.permalink,
)
md_body = convert_html_to_markdown(
html,
base_url=base_url,
mode=args.mode,
selector=args.selector,
keep_images=args.keep_images,
heading_offset=args.heading_offset,
max_heading_level=args.max_heading_level,
profile=args.profile,
hardinfo_section_level=args.hardinfo_section_level,
)
out_text = (fm.render() if fm else "") + md_body
if args.output:
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(out_text, encoding="utf-8")
else:
sys.stdout.write(out_text)
return 0
if __name__ == "__main__":
    raise SystemExit(main())
```