It accepts a local file, a URL, or stdin; it can emit Jekyll front matter; and it uses conservative conversion rules that tend to render well on GitHub Pages (kramdown).

#Key behaviors

Default extraction is --mode main (tries <main>, <article>, common content containers, then a “largest text block” heuristic).
Use --selector when heuristics pick the wrong container.
Tables become Markdown tables only when they’re simple; otherwise they become bullets.

#Install and run

python3 -m pip install --user beautifulsoup4 lxml

#Generic HTML file → Jekyll post

python3 html_to_jekyll_md.py page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta --tags imported,html

#Generic URL → Jekyll page

python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/ --date-from-meta

#When heuristics pick the wrong content container, force it with a selector

python3 html_to_jekyll_md.py page.html --selector "article" -o docs/page.md --layout page

#Practical limits

If the page is JavaScript-rendered (SPA shells), you need a “rendered HTML snapshot” first (browser “Save page as…”, or a headless browser step). This script converts the HTML it is given; it does not run scripts.
Highly visual layouts won’t “round-trip” cleanly. The goal here is readable Markdown, not pixel-perfect reconstruction.
Tables are conservative by design: broken tables are worse than bullet lists on GitHub Pages.

#Optional batch wrapper (bash)

#!/usr/bin/env bash
set -euo pipefail

in="${1:?input (file or URL or '-') required}"
out="${2:-docs/$(basename "${in%.*}").md}"

python3 ./html_to_jekyll_md.py "$in" -o "$out" --layout page --date-from-meta
echo "Wrote: $out"

#python script

  #!/usr/bin/env python3
"""html_to_jekyll_md.py

Universal HTML → Markdown converter with Jekyll front matter.

Goals
- Accept *any* single-page HTML source: local file, URL (http/https), or stdin.
- Emit readable Markdown that works well on GitHub Pages (Jekyll + kramdown).
- Provide an optional HardInfo2 profile (because HardInfo2 uses layout tables).

Install
  python3 -m pip install beautifulsoup4
  (optional, faster/more robust parsing) python3 -m pip install lxml

Examples
  # Local file → Jekyll post
  python3 html_to_jekyll_md.py ./page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta

  # URL → Jekyll page
  python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/

  # stdin → stdout
  cat page.html | python3 html_to_jekyll_md.py -

  # HardInfo2 report (auto-detect, or force)
  python3 html_to_jekyll_md.py hardinfo2_report.html --profile hardinfo2 -o docs/hardinfo2.md

Notes
- This converts the HTML it receives; it does not execute JavaScript.
- “Perfect” HTML→Markdown is not generally achievable. This aims for a clean, readable document.
"""

from __future__ import annotations

import argparse
import re
import sys
import urllib.parse
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterable, List, Optional, Sequence, Tuple


# -----------------------------
# Parsing helpers
# -----------------------------

_WS_RE = re.compile(r"[ \t\r\f\v]+")


def _require_bs4() -> None:
    try:
        import bs4  # noqa: F401
    except Exception:
        print(
            "ERROR: Missing dependency: beautifulsoup4\n"
            "  Install with: python3 -m pip install beautifulsoup4",
            file=sys.stderr,
        )
        raise


def _make_soup(html: str):
    from bs4 import BeautifulSoup

    try:
        return BeautifulSoup(html, "lxml")
    except Exception:
        return BeautifulSoup(html, "html.parser")


def _norm_ws_keep_newlines(s: str) -> str:
    s = s.replace("\u00a0", " ")
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    lines = [_WS_RE.sub(" ", ln).strip() for ln in s.split("\n")]
    out = "\n".join(lines)
    out = re.sub(r"\n{3,}", "\n\n", out)
    return out.strip()


def _yaml_quote(s: str) -> str:
    return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _escape_md_inline(s: str) -> str:
    # Minimal escaping; keep it conservative.
    return s.replace("\\", "\\\\").replace("`", "\\`")


def _escape_table_cell(s: str) -> str:
    s = s.replace("\\", "\\\\")
    s = s.replace("|", "\\|")
    s = s.replace("\n", "<br />")
    return s.strip()


def _collapse_blank_lines(lines: List[str]) -> List[str]:
    out: List[str] = []
    blank = False
    for ln in lines:
        if not ln.strip():
            if not blank:
                out.append("")
            blank = True
        else:
            out.append(ln.rstrip())
            blank = False

    while out and out[0] == "":
        out.pop(0)
    while out and out[-1] == "":
        out.pop()
    return out


def _indent_lines(lines: Iterable[str], spaces: int) -> List[str]:
    pad = " " * spaces
    return [(pad + ln) if ln else "" for ln in lines]


def _inline_or_codeblock(text: str) -> Tuple[str, Optional[List[str]]]:
    text = _norm_ws_keep_newlines(text)
    if not text:
        return "", None

    lines = [ln.rstrip() for ln in text.splitlines() if ln.strip()]
    if len(lines) <= 1:
        return (lines[0] if lines else ""), None

    total = sum(len(ln) for ln in lines)
    if len(lines) <= 3 and total <= 140:
        return " ; ".join(lines), None

    return "", ["```text", *lines, "```"]


# -----------------------------
# Input loading
# -----------------------------


def _read_input(input_arg: str, user_agent: str = "html_to_jekyll_md/1.1") -> Tuple[str, Optional[str]]:
    """Return (html, base_url).

    base_url is used to resolve relative links/images.
    - stdin: None
    - file: None
    - URL: the URL
    """

    if input_arg == "-":
        return sys.stdin.read(), None

    if re.match(r"^https?://", input_arg, flags=re.IGNORECASE):
        req = urllib.request.Request(input_arg, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            data = resp.read()
            try:
                html = data.decode(charset, errors="replace")
            except Exception:
                html = data.decode("utf-8", errors="replace")
        return html, input_arg

    path = Path(input_arg)
    return path.read_text(encoding="utf-8", errors="replace"), None


def _extract_base_href(soup) -> Optional[str]:
    base = soup.find("base")
    if base and base.get("href"):
        return _norm_ws_keep_newlines(base["href"]) or None
    return None


# -----------------------------
# Jekyll front matter
# -----------------------------


@dataclass
class FrontMatter:
    layout: str = "post"
    title: str = "Document"
    date: Optional[str] = None
    categories: Optional[List[str]] = None
    tags: Optional[List[str]] = None
    permalink: Optional[str] = None

    def render(self) -> str:
        lines = ["---", f"layout: {self.layout}", f"title: {_yaml_quote(self.title)}"]
        if self.date:
            lines.append(f"date: {_yaml_quote(self.date)}")
        if self.categories:
            lines.append("categories:")
            for c in self.categories:
                lines.append(f"  - {c}")
        if self.tags:
            lines.append("tags:")
            for t in self.tags:
                lines.append(f"  - {t}")
        if self.permalink:
            lines.append(f"permalink: {self.permalink}")
        lines.append("---")
        return "\n".join(lines) + "\n\n"


def _extract_title(soup, override: Optional[str] = None) -> str:
    if override:
        return override

    t = soup.find("title")
    if t and t.get_text(strip=True):
        return _norm_ws_keep_newlines(t.get_text())

    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        return _norm_ws_keep_newlines(h1.get_text())

    return "Document"


def _extract_date_hint(soup) -> Optional[str]:
    meta_props = [
        ("property", "article:published_time"),
        ("name", "date"),
        ("name", "publish-date"),
        ("name", "pubdate"),
        ("itemprop", "datePublished"),
    ]

    for attr, key in meta_props:
        m = soup.find("meta", attrs={attr: key})
        if m and m.get("content"):
            return _norm_ws_keep_newlines(m["content"])

    tm = soup.find("time")
    if tm and tm.get("datetime"):
        return _norm_ws_keep_newlines(tm["datetime"])

    return None


def _now_with_tz() -> str:
    return datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")


# -----------------------------
# Content selection
# -----------------------------


def _strip_non_content(soup) -> None:
    # Keep it conservative; do not drop “head” here.
    for sel in ["script", "style", "noscript", "template", "iframe"]:
        for tag in soup.select(sel):
            tag.decompose()


def _maybe_strip_chrome(root) -> None:
    for sel in ["nav", "header", "footer", "aside"]:
        for tag in root.find_all(sel):
            tag.decompose()


def _text_len(tag) -> int:
    return len(tag.get_text(" ", strip=True))


def _score_candidate(tag) -> int:
    text = tag.get_text(" ", strip=True)
    if not text:
        return 0
    text_len = len(text)
    link_text = " ".join(a.get_text(" ", strip=True) for a in tag.find_all("a"))
    return max(0, text_len - int(0.7 * len(link_text)))


def _select_root(soup, mode: str, selector: Optional[str]):
    body = soup.body or soup

    if selector:
        found = soup.select_one(selector)
        return found or body

    if mode == "body":
        return body

    # mode == "main"
    for tagname in ["main", "article"]:
        t = body.find(tagname)
        if t and _text_len(t) > 50:
            _maybe_strip_chrome(t)
            return t

    for css in ["#content", "#main", ".content", ".main", ".post", ".article", ".entry-content"]:
        t = body.select_one(css)
        if t and _text_len(t) > 80:
            _maybe_strip_chrome(t)
            return t

    candidates = body.find_all(["div", "section", "article", "main"], limit=250)
    best = body
    best_score = _score_candidate(body)
    for c in candidates:
        sc = _score_candidate(c)
        if sc > best_score:
            best, best_score = c, sc

    _maybe_strip_chrome(best)
    return best


# -----------------------------
# HardInfo2 profile (optional)
# -----------------------------


def _looks_like_hardinfo2(soup) -> bool:
    # HardInfo2 exports layout tables with these classes.
    return bool(soup.find("td", class_="stitle") or soup.find("h1", class_="title"))


def _hardinfo2_text(tag) -> str:
    # HardInfo2 uses <br> and <tt> blocks; preserve newlines.
    return _norm_ws_keep_newlines(tag.get_text("\n", strip=True))


def _hardinfo2_iter_rows(table):
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table

    for tr in container.find_all("tr", recursive=False):
        tds = tr.find_all("td", recursive=False)
        if not tds:
            continue

        if any("stitle" in (td.get("class") or []) for td in tds):
            continue

        sst = tr.find("td", class_="sstitle")
        if sst:
            yield ("sstitle", _hardinfo2_text(sst))
            continue

        nested_tables = tr.find_all("table")
        if nested_tables and len(tds) == 1 and tds[0].get("colspan"):
            yield ("details", nested_tables)
            continue

        # Drop icon columns
        data = []
        for td in tds:
            cls = td.get("class") or []
            if "icon" in cls or "icon_subtitle" in cls:
                continue
            data.append(td)
        if not data:
            continue

        if "field" in (data[0].get("class") or []):
            field = _hardinfo2_text(data[0])
            vals = [_hardinfo2_text(td) for td in data[1:]]
            yield ("kv", (field, vals, tr.find_all("table") or None))
        else:
            yield ("raw", [_hardinfo2_text(td) for td in data])


def _hardinfo2_render_table(table, heading_level: int) -> List[str]:
    lines: List[str] = []

    st = table.find("td", class_="stitle")
    title = _hardinfo2_text(st) if st else None
    if title:
        lines.append("#" * heading_level + f" {_escape_md_inline(title)}")
        lines.append("")

    def render_details(details_table, indent=2) -> List[str]:
        out: List[str] = []
        for kind, payload in _hardinfo2_iter_rows(details_table):
            if kind == "sstitle":
                out.append(" " * indent + f"- **{_escape_md_inline(payload)}**")
                continue

            if kind == "details":
                for t in payload:
                    out.extend(render_details(t, indent=indent + 2))
                continue

            if kind == "kv":
                field, vals, nested = payload
                val_text = " | ".join(v for v in vals if v)
                inline, code = _inline_or_codeblock(val_text)
                if inline:
                    out.append(" " * indent + f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
                else:
                    out.append(" " * indent + f"- **{_escape_md_inline(field)}:**")
                    out.extend(_indent_lines(code or [], indent + 4))
                if nested:
                    for t in nested:
                        out.extend(render_details(t, indent=indent + 2))
                continue

            if kind == "raw":
                txt = " | ".join(payload)
                inline, code = _inline_or_codeblock(txt)
                if inline:
                    out.append(" " * indent + f"- {_escape_md_inline(inline)}")
                else:
                    out.append(" " * indent + "-")
                    out.extend(_indent_lines(code or [], indent + 4))

        return out

    for kind, payload in _hardinfo2_iter_rows(table):
        if kind == "sstitle":
            lines.append("#" * (heading_level + 1) + f" {_escape_md_inline(payload)}")
            lines.append("")
            continue

        if kind == "details":
            lines.append("- **Details:**")
            for t in payload:
                lines.extend(render_details(t, indent=2))
            lines.append("")
            continue

        if kind == "kv":
            field, vals, nested = payload
            val_text = " | ".join(v for v in vals if v)
            inline, code = _inline_or_codeblock(val_text)
            if inline:
                lines.append(f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
            else:
                lines.append(f"- **{_escape_md_inline(field)}:**")
                lines.extend(_indent_lines(code or [], 4))
            if nested:
                for t in nested:
                    lines.extend(render_details(t, indent=2))
            continue

        if kind == "raw":
            txt = " | ".join(payload)
            inline, code = _inline_or_codeblock(txt)
            if inline:
                lines.append(f"- {_escape_md_inline(inline)}")
            else:
                lines.append("-")
                lines.extend(_indent_lines(code or [], 4))

    lines.append("")
    return lines


def _hardinfo2_convert(html: str, section_heading_level: int = 2) -> str:
    soup = _make_soup(html)
    root = soup.body or soup

    out: List[str] = []
    for el in root.find_all(["h1", "table"], recursive=True):
        if el.name == "h1" and "title" in (el.get("class") or []):
            out.append("#" * section_heading_level + f" {_escape_md_inline(_hardinfo2_text(el))}")
            out.append("")
        elif el.name == "table" and el.find_parent("table") is None:
            out.extend(_hardinfo2_render_table(el, heading_level=section_heading_level + 1))

    return "\n".join(_collapse_blank_lines(out)) + "\n"


# -----------------------------
# Generic HTML → Markdown
# -----------------------------


BLOCK_TAGS = {
    "p",
    "div",
    "section",
    "article",
    "main",
    "pre",
    "blockquote",
    "ul",
    "ol",
    "table",
    "hr",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
}


class _Ctx:
    def __init__(
        self,
        base_url: Optional[str],
        keep_images: bool,
        heading_offset: int,
        max_heading_level: int,
    ):
        self.base_url = base_url
        self.keep_images = keep_images
        self.heading_offset = heading_offset
        self.max_heading_level = max_heading_level


def _resolve_url(href: str, base_url: Optional[str]) -> str:
    if not href:
        return ""
    return urllib.parse.urljoin(base_url, href) if base_url else href


def _extract_text(tag) -> str:
    return _norm_ws_keep_newlines(tag.get_text(" ", strip=True))


def _convert_children(tag, ctx: _Ctx) -> List[str]:
    from bs4 import NavigableString, Tag

    out: List[str] = []
    for child in tag.children:
        if isinstance(child, NavigableString):
            txt = _norm_ws_keep_newlines(str(child))
            if txt:
                out.append(txt)
            continue
        if isinstance(child, Tag):
            out.extend(_convert_node(child, ctx))
    return out


def _convert_children_inline(tag, ctx: _Ctx) -> List[str]:
    from bs4 import NavigableString, Tag

    out: List[str] = []
    for child in tag.children:
        if isinstance(child, NavigableString):
            txt = _norm_ws_keep_newlines(str(child))
            if txt:
                out.append(txt)
            continue
        if isinstance(child, Tag):
            out.append(_convert_inline(child, ctx))
    return out


def _convert_inline(tag, ctx: _Ctx) -> str:
    name = tag.name.lower()

    if name in ("span", "label", "small", "sup", "sub"):
        return "".join(_convert_children_inline(tag, ctx))

    if name in ("strong", "b"):
        inner = "".join(_convert_children_inline(tag, ctx)).strip()
        return f"**{inner}**" if inner else ""

    if name in ("em", "i"):
        inner = "".join(_convert_children_inline(tag, ctx)).strip()
        return f"*{inner}*" if inner else ""

    if name == "code":
        inner = _extract_text(tag)
        if "`" in inner:
            return f"``{inner}``"
        return f"`{inner}`"

    if name == "br":
        return "  \n"

    if name == "a":
        href = _resolve_url(tag.get("href", ""), ctx.base_url)
        text = "".join(_convert_children_inline(tag, ctx)).strip() or href
        return f"[{text}]({href})" if href else text

    if name == "img":
        if not ctx.keep_images:
            return ""
        alt = (tag.get("alt") or "image").strip() or "image"
        src = _resolve_url(tag.get("src", ""), ctx.base_url)
        return f"![{_escape_md_inline(alt)}]({src})" if src else ""

    return "".join(_convert_children_inline(tag, ctx))


def _table_to_bullets(table, ctx: _Ctx) -> List[str]:
    lines: List[str] = []
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table

    for tr in container.find_all("tr", recursive=False):
        cells = tr.find_all(["th", "td"], recursive=False)
        if not cells:
            continue
        vals = [_extract_text(td) for td in cells]
        vals = [v for v in vals if v]
        if not vals:
            continue
        txt = " | ".join(vals)
        inline, code = _inline_or_codeblock(txt)
        if inline:
            lines.append(f"- {_escape_md_inline(inline)}")
        else:
            lines.append("-")
            lines.extend(_indent_lines(code or [], 4))

    lines.append("")
    return lines


def _table_to_md(table, ctx: _Ctx) -> List[str]:
    if table.find("table"):
        return _table_to_bullets(table, ctx)

    rows: List[List[str]] = []
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table

    for tr in container.find_all("tr", recursive=False):
        cells = tr.find_all(["th", "td"], recursive=False)
        if not cells:
            continue
        rows.append([_extract_text(td) for td in cells])

    if not rows:
        return []

    has_th = bool(table.find("th"))
    header = rows[0] if has_th else None
    body_rows = rows[1:] if has_th else rows

    if header is None:
        first = rows[0]
        if first and all(len(c) <= 40 for c in first) and len(first) <= 6:
            header = first
            body_rows = rows[1:]
        else:
            header = [f"Col {i+1}" for i in range(len(rows[0]))]
            body_rows = rows

    col_count = max(len(header), *(len(r) for r in body_rows))

    def pad(r: List[str]) -> List[str]:
        return (r + [""] * col_count)[:col_count]

    header = pad(header)

    lines: List[str] = []
    lines.append("| " + " | ".join(_escape_table_cell(c) for c in header) + " |")
    lines.append("|" + "|".join(["---"] * col_count) + "|")

    for r in body_rows:
        r = pad(r)
        if any("\n" in c for c in r):
            return _table_to_bullets(table, ctx)
        lines.append("| " + " | ".join(_escape_table_cell(c) for c in r) + " |")

    lines.append("")
    return lines


def _list_to_md(list_tag, ordered: bool, ctx: _Ctx, indent: int = 0) -> List[str]:
    from bs4 import Tag

    lines: List[str] = []
    items = [li for li in list_tag.find_all("li", recursive=False) if isinstance(li, Tag)]

    for idx, li in enumerate(items, start=1):
        prefix = f"{idx}. " if ordered else "- "
        cont_indent = indent + len(prefix)

        # Build “main line” from inline-ish children and the first paragraph-ish block.
        main_fragments: List[str] = []
        extra_blocks: List[List[str]] = []

        for child in li.children:
            if isinstance(child, Tag) and child.name.lower() in ("ul", "ol"):
                continue

            if isinstance(child, Tag) and child.name.lower() in BLOCK_TAGS:
                block_lines = _convert_node(child, ctx)
                block_lines = _collapse_blank_lines(block_lines)
                if block_lines:
                    if not main_fragments:
                        # Use first non-empty line as the main line.
                        first_line = next((x for x in block_lines if x.strip()), "")
                        if first_line:
                            main_fragments.append(first_line)
                        rest = [x for x in block_lines if x.strip()][1:]
                        if rest:
                            extra_blocks.append(rest)
                    else:
                        extra_blocks.append([x for x in block_lines if x.strip()])
                continue

            if isinstance(child, Tag):
                main_fragments.append(_convert_inline(child, ctx))
            else:
                txt = _norm_ws_keep_newlines(str(child))
                if txt:
                    main_fragments.append(txt)

        main_line = _norm_ws_keep_newlines("".join(main_fragments)).strip()
        lines.append(" " * indent + prefix + main_line)

        for block in extra_blocks:
            for ln in block:
                lines.append(" " * cont_indent + ln)

        for child in li.find_all(["ul", "ol"], recursive=False):
            lines.extend(_list_to_md(child, ordered=(child.name.lower() == "ol"), ctx=ctx, indent=indent + 2))

    lines.append("")
    return lines


def _convert_node(tag, ctx: _Ctx) -> List[str]:
    name = tag.name.lower()

    if name in ("script", "style", "noscript", "template"):
        return []

    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        lvl = int(name[1]) + ctx.heading_offset
        lvl = max(1, min(ctx.max_heading_level, lvl))
        text = "".join(_convert_children_inline(tag, ctx)).strip()
        text = _norm_ws_keep_newlines(text)
        return ["#" * lvl + f" {_escape_md_inline(text)}", ""] if text else []

    if name == "p":
        text = "".join(_convert_children_inline(tag, ctx)).strip()
        text = _norm_ws_keep_newlines(text)
        return [text, ""] if text else []

    if name == "hr":
        return ["---", ""]

    if name == "blockquote":
        inner = _collapse_blank_lines(_convert_children(tag, ctx))
        quoted: List[str] = []
        for ln in inner:
            quoted.append(">" if not ln.strip() else "> " + ln)
        quoted.append("")
        return quoted

    if name == "pre":
        # Try to infer a language from pre/code classes.
        lang = ""
        cls = " ".join(tag.get("class") or [])
        m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls)
        if not m:
            code = tag.find("code")
            if code:
                cls2 = " ".join(code.get("class") or [])
                m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls2)
        if m:
            lang = m.group(1)

        code_text = tag.get_text("\n", strip=False).replace("\r\n", "\n").replace("\r", "\n")
        lines = code_text.splitlines()
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()
        return [f"```{lang}".rstrip(), *lines, "```", ""]

    if name == "ul":
        return _list_to_md(tag, ordered=False, ctx=ctx)

    if name == "ol":
        return _list_to_md(tag, ordered=True, ctx=ctx)

    if name == "table":
        return _table_to_md(tag, ctx)

    if name == "img":
        inline = _convert_inline(tag, ctx)
        return [inline, ""] if inline else []

    if name in ("div", "section", "article", "main", "body"):
        return _convert_children(tag, ctx)

    inline = _convert_inline(tag, ctx).strip()
    return [inline] if inline else []


def _generic_convert(
    html: str,
    *,
    base_url: Optional[str],
    mode: str,
    selector: Optional[str],
    keep_images: bool,
    heading_offset: int,
    max_heading_level: int,
) -> str:
    soup = _make_soup(html)
    _strip_non_content(soup)

    root = _select_root(soup, mode=mode, selector=selector)
    ctx = _Ctx(
        base_url=base_url,
        keep_images=keep_images,
        heading_offset=heading_offset,
        max_heading_level=max_heading_level,
    )

    lines = _convert_children(root, ctx)
    lines = _collapse_blank_lines(lines)
    return "\n".join(lines) + "\n"


def convert_html_to_markdown(
    html: str,
    *,
    base_url: Optional[str] = None,
    mode: str = "main",  # main|body
    selector: Optional[str] = None,
    keep_images: bool = False,
    heading_offset: int = 0,
    max_heading_level: int = 6,
    profile: str = "auto",  # auto|generic|hardinfo2
    hardinfo_section_level: int = 2,
) -> str:
    soup = _make_soup(html)
    _strip_non_content(soup)

    if profile == "hardinfo2" or (profile == "auto" and _looks_like_hardinfo2(soup)):
        return _hardinfo2_convert(html, section_heading_level=max(1, hardinfo_section_level))

    return _generic_convert(
        html,
        base_url=base_url,
        mode=mode,
        selector=selector,
        keep_images=keep_images,
        heading_offset=heading_offset,
        max_heading_level=max(1, max_heading_level),
    )


# -----------------------------
# CLI
# -----------------------------


def main(argv: Optional[Sequence[str]] = None) -> int:
    _require_bs4()

    p = argparse.ArgumentParser(description="Convert HTML (file/URL/stdin) to Jekyll-friendly Markdown")
    p.add_argument("input", help="Path, URL, or '-' for stdin")
    p.add_argument("-o", "--output", type=Path, default=None, help="Output .md file (default: stdout)")

    # Front matter
    p.add_argument("--no-front-matter", action="store_true", help="Do not emit YAML front matter")
    p.add_argument("--layout", default="post", help="Front matter layout (default: post)")
    p.add_argument("--title", default=None, help="Front matter title override")
    p.add_argument("--date", default=None, help="Front matter date override")
    p.add_argument("--date-from-meta", action="store_true", help="Try to extract date from meta/time tags")
    p.add_argument("--categories", default=None, help="Comma-separated categories")
    p.add_argument("--tags", default=None, help="Comma-separated tags")
    p.add_argument("--permalink", default=None, help="Front matter permalink")

    # Conversion knobs
    p.add_argument("--mode", choices=["main", "body"], default="main", help="Content selection mode")
    p.add_argument("--selector", default=None, help="CSS selector to choose content root (overrides --mode)")
    p.add_argument("--base-url", default=None, help="Base URL for resolving relative links/images")
    p.add_argument("--keep-images", action="store_true", help="Include <img> as Markdown images")
    p.add_argument("--heading-offset", type=int, default=0, help="Add to heading levels (e.g., 1 makes h1→h2)")
    p.add_argument("--max-heading-level", type=int, default=6, help="Clamp max heading depth (default: 6)")

    # Profiles
    p.add_argument(
        "--profile",
        choices=["auto", "generic", "hardinfo2"],
        default="auto",
        help="auto detects HardInfo2; otherwise force generic/hardinfo2",
    )
    p.add_argument(
        "--hardinfo-section-level",
        type=int,
        default=2,
        help="Heading level for HardInfo2 section titles (default: 2)",
    )

    args = p.parse_args(argv)

    html, inferred_base = _read_input(args.input)
    soup = _make_soup(html)

    # Base URL: CLI > <base href> > inferred (URL input)
    base_url = args.base_url or _extract_base_href(soup) or inferred_base

    fm = None
    if not args.no_front_matter:
        title = _extract_title(soup, args.title)
        date = args.date
        if not date and args.date_from_meta:
            date = _extract_date_hint(soup)
        if not date:
            date = _now_with_tz()

        fm = FrontMatter(
            layout=args.layout,
            title=title,
            date=date,
            categories=[c.strip() for c in (args.categories or "").split(",") if c.strip()] or None,
            tags=[t.strip() for t in (args.tags or "").split(",") if t.strip()] or None,
            permalink=args.permalink,
        )

    md_body = convert_html_to_markdown(
        html,
        base_url=base_url,
        mode=args.mode,
        selector=args.selector,
        keep_images=args.keep_images,
        heading_offset=args.heading_offset,
        max_heading_level=args.max_heading_level,
        profile=args.profile,
        hardinfo_section_level=args.hardinfo_section_level,
    )

    out_text = (fm.render() if fm else "") + md_body

    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(out_text, encoding="utf-8")
    else:
        sys.stdout.write(out_text)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())

URL: https://ib.bsb.br/html2md