HTML to Markdown

Slug: html2md


The script, html_to_jekyll_md.py, accepts a local file, a URL, or stdin; it can emit Jekyll front matter; and it uses conservative conversion rules that tend to render well on GitHub Pages (kramdown).

# Key behaviors

  • Default extraction is --mode main (tries <main>, <article>, common content containers, then a “largest text block” heuristic).
  • Use --selector when heuristics pick the wrong container.
  • Tables become Markdown tables only when they’re simple; otherwise they become bullet lists (see the example after this list).
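
For illustration, here is the table rule in action on a hypothetical input (the output is representative of the current rules, assuming the parser leaves <tr> elements as direct children of <table>):

<table><tr><th>Key</th><th>Value</th></tr><tr><td>CPU</td><td>Ryzen 5</td></tr></table>

becomes

| Key | Value |
|---|---|
| CPU | Ryzen 5 |

A table that contains a nested <table> instead becomes one bullet per row.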

# Install and run

python3 -m pip install --user beautifulsoup4 lxml
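
A quick sanity check that both parsers import (lxml is optional; the script falls back to html.parser when it is absent):

python3 -c "import bs4, lxml; print('ok')"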

# Generic HTML file → Jekyll post

python3 html_to_jekyll_md.py page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta --tags imported,html

# Generic URL → Jekyll page

python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/ --date-from-meta
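
For reference, the front matter this command emits looks like the following (layout and permalink come from the flags; title and date are pulled from the page itself, so the values below are placeholders):

---
layout: page
title: "Example article title"
date: "2026-01-09T12:00:00+00:00"
permalink: /article/
---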

# When heuristics pick the wrong content container, force it with a selector

python3 html_to_jekyll_md.py page.html --selector "article" -o docs/page.md --layout page
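
If you are unsure which selector to pass, a quick way to preview what it matches (a sketch using BeautifulSoup directly; page.html matches the example above):

python3 - <<'EOF'
from bs4 import BeautifulSoup

html = open("page.html", encoding="utf-8").read()
soup = BeautifulSoup(html, "html.parser")
node = soup.select_one("article")  # the selector you intend to pass
print(node.get_text(" ", strip=True)[:200] if node else "no match")
EOF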

# Practical limits

  • If the page is JavaScript-rendered (an SPA shell), you need a rendered-HTML snapshot first (browser “Save page as…”, or a headless-browser step; one approach is sketched after this list). This script converts the HTML it is given; it does not run scripts.
  • Highly visual layouts won’t “round-trip” cleanly. The goal here is readable Markdown, not pixel-perfect reconstruction.
  • Tables are conservative by design: broken tables are worse than bullet lists on GitHub Pages.
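
One way to produce such a snapshot, assuming a Chromium-based browser is installed (the binary may be chromium, chromium-browser, or google-chrome on your system):

chromium --headless --dump-dom "https://example.com/app" > rendered.html
python3 html_to_jekyll_md.py rendered.html -o docs/app.md --layout page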

# Optional batch wrapper (bash)

#!/usr/bin/env bash
set -euo pipefail

in="${1:?input (file or URL or '-') required}"
out="${2:-docs/$(basename "${in%.*}").md}"

python3 ./html_to_jekyll_md.py "$in" -o "$out" --layout page --date-from-meta
echo "Wrote: $out"
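
Saved as, say, html2page.sh (the name is illustrative), the wrapper runs like this:

chmod +x html2page.sh
./html2page.sh https://example.com/article docs/article.md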

# Python script

#!/usr/bin/env python3
"""html_to_jekyll_md.py

Universal HTML → Markdown converter with Jekyll front matter.

Goals
- Accept *any* single-page HTML source: local file, URL (http/https), or stdin.
- Emit readable Markdown that works well on GitHub Pages (Jekyll + kramdown).
- Provide an optional HardInfo2 profile (because HardInfo2 uses layout tables).

Install
    python3 -m pip install beautifulsoup4
    (optional, faster/more robust parsing)
    python3 -m pip install lxml

Examples
    # Local file → Jekyll post
    python3 html_to_jekyll_md.py ./page.html -o _posts/2026-01-09-page.md --layout post --date-from-meta

    # URL → Jekyll page
    python3 html_to_jekyll_md.py https://example.com/article -o docs/article.md --layout page --permalink /article/

    # stdin → stdout
    cat page.html | python3 html_to_jekyll_md.py -

    # HardInfo2 report (auto-detect, or force)
    python3 html_to_jekyll_md.py hardinfo2_report.html --profile hardinfo2 -o docs/hardinfo2.md

Notes
- This converts the HTML it receives; it does not execute JavaScript.
- “Perfect” HTML→Markdown is not generally achievable. This aims for a clean, readable document.
"""

from __future__ import annotations

import argparse
import re
import sys
import urllib.parse
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterable, List, Optional, Sequence, Tuple

# -----------------------------
# Parsing helpers
# -----------------------------

_WS_RE = re.compile(r"[ \t\r\f\v]+")


def _require_bs4() -> None:
    try:
        import bs4  # noqa: F401
    except Exception:
        print(
            "ERROR: Missing dependency: beautifulsoup4\n"
            "  Install with: python3 -m pip install beautifulsoup4",
            file=sys.stderr,
        )
        raise


def _make_soup(html: str):
    from bs4 import BeautifulSoup

    try:
        return BeautifulSoup(html, "lxml")
    except Exception:
        return BeautifulSoup(html, "html.parser")


def _norm_ws_keep_newlines(s: str) -> str:
    s = s.replace("\u00a0", " ")
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    lines = [_WS_RE.sub(" ", ln).strip() for ln in s.split("\n")]
    out = "\n".join(lines)
    out = re.sub(r"\n{3,}", "\n\n", out)
    return out.strip()


def _yaml_quote(s: str) -> str:
    return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _escape_md_inline(s: str) -> str:
    # Minimal escaping; keep it conservative.
    return s.replace("\\", "\\\\").replace("`", "\\`")


def _escape_table_cell(s: str) -> str:
    s = s.replace("\\", "\\\\")
    s = s.replace("|", "\\|")
    s = s.replace("\n", "<br />")
    return s.strip()


def _collapse_blank_lines(lines: List[str]) -> List[str]:
    out: List[str] = []
    blank = False
    for ln in lines:
        if not ln.strip():
            if not blank:
                out.append("")
            blank = True
        else:
            out.append(ln.rstrip())
            blank = False
    while out and out[0] == "":
        out.pop(0)
    while out and out[-1] == "":
        out.pop()
    return out


def _indent_lines(lines: Iterable[str], spaces: int) -> List[str]:
    pad = " " * spaces
    return [(pad + ln) if ln else "" for ln in lines]


def _inline_or_codeblock(text: str) -> Tuple[str, Optional[List[str]]]:
    text = _norm_ws_keep_newlines(text)
    if not text:
        return "", None
    lines = [ln.rstrip() for ln in text.splitlines() if ln.strip()]
    if len(lines) <= 1:
        return (lines[0] if lines else ""), None
    total = sum(len(ln) for ln in lines)
    if len(lines) <= 3 and total <= 140:
        return " ; ".join(lines), None
    return "", ["```text", *lines, "```"]


# -----------------------------
# Input loading
# -----------------------------

def _read_input(input_arg: str, user_agent: str = "html_to_jekyll_md/1.1") -> Tuple[str, Optional[str]]:
    """Return (html, base_url).

    base_url is used to resolve relative links/images.
    - stdin: None
    - file: None
    - URL: the URL
    """
    if input_arg == "-":
        return sys.stdin.read(), None
    if re.match(r"^https?://", input_arg, flags=re.IGNORECASE):
        req = urllib.request.Request(input_arg, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            data = resp.read()
        try:
            html = data.decode(charset, errors="replace")
        except Exception:
            html = data.decode("utf-8", errors="replace")
        return html, input_arg
    path = Path(input_arg)
    return path.read_text(encoding="utf-8", errors="replace"), None


def _extract_base_href(soup) -> Optional[str]:
    base = soup.find("base")
    if base and base.get("href"):
        return _norm_ws_keep_newlines(base["href"]) or None
    return None


# -----------------------------
# Jekyll front matter
# -----------------------------

@dataclass
class FrontMatter:
    layout: str = "post"
    title: str = "Document"
    date: Optional[str] = None
    categories: Optional[List[str]] = None
    tags: Optional[List[str]] = None
    permalink: Optional[str] = None

    def render(self) -> str:
        lines = ["---", f"layout: {self.layout}", f"title: {_yaml_quote(self.title)}"]
        if self.date:
            lines.append(f"date: {_yaml_quote(self.date)}")
        if self.categories:
            lines.append("categories:")
            for c in self.categories:
                lines.append(f"  - {c}")
        if self.tags:
            lines.append("tags:")
            for t in self.tags:
                lines.append(f"  - {t}")
        if self.permalink:
            lines.append(f"permalink: {self.permalink}")
        lines.append("---")
        return "\n".join(lines) + "\n\n"


def _extract_title(soup, override: Optional[str] = None) -> str:
    if override:
        return override
    t = soup.find("title")
    if t and t.get_text(strip=True):
        return _norm_ws_keep_newlines(t.get_text())
    h1 = soup.find("h1")
    if h1 and h1.get_text(strip=True):
        return _norm_ws_keep_newlines(h1.get_text())
    return "Document"


def _extract_date_hint(soup) -> Optional[str]:
    meta_props = [
        ("property", "article:published_time"),
        ("name", "date"),
        ("name", "publish-date"),
        ("name", "pubdate"),
        ("itemprop", "datePublished"),
    ]
    for attr, key in meta_props:
        m = soup.find("meta", attrs={attr: key})
        if m and m.get("content"):
            return _norm_ws_keep_newlines(m["content"])
    tm = soup.find("time")
    if tm and tm.get("datetime"):
        return _norm_ws_keep_newlines(tm["datetime"])
    return None


def _now_with_tz() -> str:
    return datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %z")


# -----------------------------
# Content selection
# -----------------------------

def _strip_non_content(soup) -> None:
    # Keep it conservative; do not drop “head” here.
    for sel in ["script", "style", "noscript", "template", "iframe"]:
        for tag in soup.select(sel):
            tag.decompose()


def _maybe_strip_chrome(root) -> None:
    for sel in ["nav", "header", "footer", "aside"]:
        for tag in root.find_all(sel):
            tag.decompose()


def _text_len(tag) -> int:
    return len(tag.get_text(" ", strip=True))


def _score_candidate(tag) -> int:
    text = tag.get_text(" ", strip=True)
    if not text:
        return 0
    text_len = len(text)
    link_text = " ".join(a.get_text(" ", strip=True) for a in tag.find_all("a"))
    return max(0, text_len - int(0.7 * len(link_text)))


def _select_root(soup, mode: str, selector: Optional[str]):
    body = soup.body or soup
    if selector:
        found = soup.select_one(selector)
        return found or body
    if mode == "body":
        return body
    # mode == "main"
    for tagname in ["main", "article"]:
        t = body.find(tagname)
        if t and _text_len(t) > 50:
            _maybe_strip_chrome(t)
            return t
    for css in ["#content", "#main", ".content", ".main", ".post", ".article", ".entry-content"]:
        t = body.select_one(css)
        if t and _text_len(t) > 80:
            _maybe_strip_chrome(t)
            return t
    candidates = body.find_all(["div", "section", "article", "main"], limit=250)
    best = body
    best_score = _score_candidate(body)
    for c in candidates:
        sc = _score_candidate(c)
        if sc > best_score:
            best, best_score = c, sc
    _maybe_strip_chrome(best)
    return best


# -----------------------------
# HardInfo2 profile (optional)
# -----------------------------

def _looks_like_hardinfo2(soup) -> bool:
    # HardInfo2 exports layout tables with these classes.
    return bool(soup.find("td", class_="stitle") or soup.find("h1", class_="title"))


def _hardinfo2_text(tag) -> str:
    # HardInfo2 uses <br> and <tt> blocks; preserve newlines.
    return _norm_ws_keep_newlines(tag.get_text("\n", strip=True))


def _hardinfo2_iter_rows(table):
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table
    for tr in container.find_all("tr", recursive=False):
        tds = tr.find_all("td", recursive=False)
        if not tds:
            continue
        if any("stitle" in (td.get("class") or []) for td in tds):
            continue
        sst = tr.find("td", class_="sstitle")
        if sst:
            yield ("sstitle", _hardinfo2_text(sst))
            continue
        nested_tables = tr.find_all("table")
        if nested_tables and len(tds) == 1 and tds[0].get("colspan"):
            yield ("details", nested_tables)
            continue
        # Drop icon columns
        data = []
        for td in tds:
            cls = td.get("class") or []
            if "icon" in cls or "icon_subtitle" in cls:
                continue
            data.append(td)
        if not data:
            continue
        if "field" in (data[0].get("class") or []):
            field = _hardinfo2_text(data[0])
            vals = [_hardinfo2_text(td) for td in data[1:]]
            yield ("kv", (field, vals, tr.find_all("table") or None))
        else:
            yield ("raw", [_hardinfo2_text(td) for td in data])


def _hardinfo2_render_table(table, heading_level: int) -> List[str]:
    lines: List[str] = []
    st = table.find("td", class_="stitle")
    title = _hardinfo2_text(st) if st else None
    if title:
        lines.append("#" * heading_level + f" {_escape_md_inline(title)}")
        lines.append("")

    def render_details(details_table, indent=2) -> List[str]:
        out: List[str] = []
        for kind, payload in _hardinfo2_iter_rows(details_table):
            if kind == "sstitle":
                out.append(" " * indent + f"- **{_escape_md_inline(payload)}**")
                continue
            if kind == "details":
                for t in payload:
                    out.extend(render_details(t, indent=indent + 2))
                continue
            if kind == "kv":
                field, vals, nested = payload
                val_text = " | ".join(v for v in vals if v)
                inline, code = _inline_or_codeblock(val_text)
                if inline:
                    out.append(" " * indent + f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
                else:
                    out.append(" " * indent + f"- **{_escape_md_inline(field)}:**")
                    out.extend(_indent_lines(code or [], indent + 4))
                if nested:
                    for t in nested:
                        out.extend(render_details(t, indent=indent + 2))
                continue
            if kind == "raw":
                txt = " | ".join(payload)
                inline, code = _inline_or_codeblock(txt)
                if inline:
                    out.append(" " * indent + f"- {_escape_md_inline(inline)}")
                else:
                    out.append(" " * indent + "-")
                    out.extend(_indent_lines(code or [], indent + 4))
        return out

    for kind, payload in _hardinfo2_iter_rows(table):
        if kind == "sstitle":
            lines.append("#" * (heading_level + 1) + f" {_escape_md_inline(payload)}")
            lines.append("")
            continue
        if kind == "details":
            lines.append("- **Details:**")
            for t in payload:
                lines.extend(render_details(t, indent=2))
            lines.append("")
            continue
        if kind == "kv":
            field, vals, nested = payload
            val_text = " | ".join(v for v in vals if v)
            inline, code = _inline_or_codeblock(val_text)
            if inline:
                lines.append(f"- **{_escape_md_inline(field)}:** {_escape_md_inline(inline)}")
            else:
                lines.append(f"- **{_escape_md_inline(field)}:**")
                lines.extend(_indent_lines(code or [], 4))
            if nested:
                for t in nested:
                    lines.extend(render_details(t, indent=2))
            continue
        if kind == "raw":
            txt = " | ".join(payload)
            inline, code = _inline_or_codeblock(txt)
            if inline:
                lines.append(f"- {_escape_md_inline(inline)}")
            else:
                lines.append("-")
                lines.extend(_indent_lines(code or [], 4))
    lines.append("")
    return lines


def _hardinfo2_convert(html: str, section_heading_level: int = 2) -> str:
    soup = _make_soup(html)
    root = soup.body or soup
    out: List[str] = []
    for el in root.find_all(["h1", "table"], recursive=True):
        if el.name == "h1" and "title" in (el.get("class") or []):
            out.append("#" * section_heading_level + f" {_escape_md_inline(_hardinfo2_text(el))}")
            out.append("")
        elif el.name == "table" and el.find_parent("table") is None:
            out.extend(_hardinfo2_render_table(el, heading_level=section_heading_level + 1))
    return "\n".join(_collapse_blank_lines(out)) + "\n"


# -----------------------------
# Generic HTML → Markdown
# -----------------------------

BLOCK_TAGS = {
    "p", "div", "section", "article", "main", "pre", "blockquote",
    "ul", "ol", "table", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
}


class _Ctx:
    def __init__(
        self,
        base_url: Optional[str],
        keep_images: bool,
        heading_offset: int,
        max_heading_level: int,
    ):
        self.base_url = base_url
        self.keep_images = keep_images
        self.heading_offset = heading_offset
        self.max_heading_level = max_heading_level


def _resolve_url(href: str, base_url: Optional[str]) -> str:
    if not href:
        return ""
    return urllib.parse.urljoin(base_url, href) if base_url else href


def _extract_text(tag) -> str:
    return _norm_ws_keep_newlines(tag.get_text(" ", strip=True))


def _convert_children(tag, ctx: _Ctx) -> List[str]:
    from bs4 import NavigableString, Tag

    out: List[str] = []
    for child in tag.children:
        if isinstance(child, NavigableString):
            txt = _norm_ws_keep_newlines(str(child))
            if txt:
                out.append(txt)
            continue
        if isinstance(child, Tag):
            out.extend(_convert_node(child, ctx))
    return out


def _convert_children_inline(tag, ctx: _Ctx) -> List[str]:
    from bs4 import NavigableString, Tag

    out: List[str] = []
    for child in tag.children:
        if isinstance(child, NavigableString):
            txt = _norm_ws_keep_newlines(str(child))
            if txt:
                out.append(txt)
            continue
        if isinstance(child, Tag):
            out.append(_convert_inline(child, ctx))
    return out


def _convert_inline(tag, ctx: _Ctx) -> str:
    name = tag.name.lower()
    if name in ("span", "label", "small", "sup", "sub"):
        return "".join(_convert_children_inline(tag, ctx))
    if name in ("strong", "b"):
        inner = "".join(_convert_children_inline(tag, ctx)).strip()
        return f"**{inner}**" if inner else ""
    if name in ("em", "i"):
        inner = "".join(_convert_children_inline(tag, ctx)).strip()
        return f"*{inner}*" if inner else ""
    if name == "code":
        inner = _extract_text(tag)
        if "`" in inner:
            return f"``{inner}``"
        return f"`{inner}`"
    if name == "br":
        # Two trailing spaces: Markdown hard line break.
        return "  \n"
    if name == "a":
        href = _resolve_url(tag.get("href", ""), ctx.base_url)
        text = "".join(_convert_children_inline(tag, ctx)).strip() or href
        return f"[{text}]({href})" if href else text
    if name == "img":
        if not ctx.keep_images:
            return ""
        alt = (tag.get("alt") or "image").strip() or "image"
        src = _resolve_url(tag.get("src", ""), ctx.base_url)
        return f"![{_escape_md_inline(alt)}]({src})" if src else ""
    return "".join(_convert_children_inline(tag, ctx))


def _table_to_bullets(table, ctx: _Ctx) -> List[str]:
    lines: List[str] = []
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table
    for tr in container.find_all("tr", recursive=False):
        cells = tr.find_all(["th", "td"], recursive=False)
        if not cells:
            continue
        vals = [_extract_text(td) for td in cells]
        vals = [v for v in vals if v]
        if not vals:
            continue
        txt = " | ".join(vals)
        inline, code = _inline_or_codeblock(txt)
        if inline:
            lines.append(f"- {_escape_md_inline(inline)}")
        else:
            lines.append("-")
            lines.extend(_indent_lines(code or [], 4))
    lines.append("")
    return lines


def _table_to_md(table, ctx: _Ctx) -> List[str]:
    if table.find("table"):
        return _table_to_bullets(table, ctx)
    rows: List[List[str]] = []
    tbody = table.find("tbody", recursive=False)
    container = tbody if tbody else table
    for tr in container.find_all("tr", recursive=False):
        cells = tr.find_all(["th", "td"], recursive=False)
        if not cells:
            continue
        rows.append([_extract_text(td) for td in cells])
    if not rows:
        return []
    has_th = bool(table.find("th"))
    header = rows[0] if has_th else None
    body_rows = rows[1:] if has_th else rows
    if header is None:
        first = rows[0]
        if first and all(len(c) <= 40 for c in first) and len(first) <= 6:
            header = first
            body_rows = rows[1:]
        else:
            header = [f"Col {i+1}" for i in range(len(rows[0]))]
            body_rows = rows
    # Use a list so header-only tables don't crash max() on an empty sequence.
    col_count = max([len(header)] + [len(r) for r in body_rows])

    def pad(r: List[str]) -> List[str]:
        return (r + [""] * col_count)[:col_count]

    header = pad(header)
    lines: List[str] = []
    lines.append("| " + " | ".join(_escape_table_cell(c) for c in header) + " |")
    lines.append("|" + "|".join(["---"] * col_count) + "|")
    for r in body_rows:
        r = pad(r)
        if any("\n" in c for c in r):
            return _table_to_bullets(table, ctx)
        lines.append("| " + " | ".join(_escape_table_cell(c) for c in r) + " |")
    lines.append("")
    return lines


def _list_to_md(list_tag, ordered: bool, ctx: _Ctx, indent: int = 0) -> List[str]:
    from bs4 import Tag

    lines: List[str] = []
    items = [li for li in list_tag.find_all("li", recursive=False) if isinstance(li, Tag)]
    for idx, li in enumerate(items, start=1):
        prefix = f"{idx}. " if ordered else "- "
        cont_indent = indent + len(prefix)
        # Build “main line” from inline-ish children and the first paragraph-ish block.
        main_fragments: List[str] = []
        extra_blocks: List[List[str]] = []
        for child in li.children:
            if isinstance(child, Tag) and child.name.lower() in ("ul", "ol"):
                continue
            if isinstance(child, Tag) and child.name.lower() in BLOCK_TAGS:
                block_lines = _convert_node(child, ctx)
                block_lines = _collapse_blank_lines(block_lines)
                if block_lines:
                    if not main_fragments:
                        # Use first non-empty line as the main line.
                        first_line = next((x for x in block_lines if x.strip()), "")
                        if first_line:
                            main_fragments.append(first_line)
                        rest = [x for x in block_lines if x.strip()][1:]
                        if rest:
                            extra_blocks.append(rest)
                    else:
                        extra_blocks.append([x for x in block_lines if x.strip()])
                continue
            if isinstance(child, Tag):
                main_fragments.append(_convert_inline(child, ctx))
            else:
                txt = _norm_ws_keep_newlines(str(child))
                if txt:
                    main_fragments.append(txt)
        main_line = _norm_ws_keep_newlines("".join(main_fragments)).strip()
        lines.append(" " * indent + prefix + main_line)
        for block in extra_blocks:
            for ln in block:
                lines.append(" " * cont_indent + ln)
        for child in li.find_all(["ul", "ol"], recursive=False):
            lines.extend(_list_to_md(child, ordered=(child.name.lower() == "ol"), ctx=ctx, indent=indent + 2))
    lines.append("")
    return lines


def _convert_node(tag, ctx: _Ctx) -> List[str]:
    name = tag.name.lower()
    if name in ("script", "style", "noscript", "template"):
        return []
    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        lvl = int(name[1]) + ctx.heading_offset
        lvl = max(1, min(ctx.max_heading_level, lvl))
        text = "".join(_convert_children_inline(tag, ctx)).strip()
        text = _norm_ws_keep_newlines(text)
        return ["#" * lvl + f" {_escape_md_inline(text)}", ""] if text else []
    if name == "p":
        text = "".join(_convert_children_inline(tag, ctx)).strip()
        text = _norm_ws_keep_newlines(text)
        return [text, ""] if text else []
    if name == "hr":
        return ["---", ""]
    if name == "blockquote":
        inner = _collapse_blank_lines(_convert_children(tag, ctx))
        quoted: List[str] = []
        for ln in inner:
            quoted.append(">" if not ln.strip() else "> " + ln)
        quoted.append("")
        return quoted
    if name == "pre":
        # Try to infer a language from pre/code classes.
        lang = ""
        cls = " ".join(tag.get("class") or [])
        m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls)
        if not m:
            code = tag.find("code")
            if code:
                cls2 = " ".join(code.get("class") or [])
                m = re.search(r"language-([a-zA-Z0-9_+-]+)", cls2)
        if m:
            lang = m.group(1)
        code_text = tag.get_text("\n", strip=False).replace("\r\n", "\n").replace("\r", "\n")
        lines = code_text.splitlines()
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()
        return [f"```{lang}".rstrip(), *lines, "```", ""]
    if name == "ul":
        return _list_to_md(tag, ordered=False, ctx=ctx)
    if name == "ol":
        return _list_to_md(tag, ordered=True, ctx=ctx)
    if name == "table":
        return _table_to_md(tag, ctx)
    if name == "img":
        inline = _convert_inline(tag, ctx)
        return [inline, ""] if inline else []
    if name in ("div", "section", "article", "main", "body"):
        return _convert_children(tag, ctx)
    inline = _convert_inline(tag, ctx).strip()
    return [inline] if inline else []


def _generic_convert(
    html: str,
    *,
    base_url: Optional[str],
    mode: str,
    selector: Optional[str],
    keep_images: bool,
    heading_offset: int,
    max_heading_level: int,
) -> str:
    soup = _make_soup(html)
    _strip_non_content(soup)
    root = _select_root(soup, mode=mode, selector=selector)
    ctx = _Ctx(
        base_url=base_url,
        keep_images=keep_images,
        heading_offset=heading_offset,
        max_heading_level=max_heading_level,
    )
    lines = _convert_children(root, ctx)
    lines = _collapse_blank_lines(lines)
    return "\n".join(lines) + "\n"


def convert_html_to_markdown(
    html: str,
    *,
    base_url: Optional[str] = None,
    mode: str = "main",  # main|body
    selector: Optional[str] = None,
    keep_images: bool = False,
    heading_offset: int = 0,
    max_heading_level: int = 6,
    profile: str = "auto",  # auto|generic|hardinfo2
    hardinfo_section_level: int = 2,
) -> str:
    soup = _make_soup(html)
    _strip_non_content(soup)
    if profile == "hardinfo2" or (profile == "auto" and _looks_like_hardinfo2(soup)):
        return _hardinfo2_convert(html, section_heading_level=max(1, hardinfo_section_level))
    return _generic_convert(
        html,
        base_url=base_url,
        mode=mode,
        selector=selector,
        keep_images=keep_images,
        heading_offset=heading_offset,
        max_heading_level=max(1, max_heading_level),
    )


# -----------------------------
# CLI
# -----------------------------

def main(argv: Optional[Sequence[str]] = None) -> int:
    _require_bs4()
    p = argparse.ArgumentParser(description="Convert HTML (file/URL/stdin) to Jekyll-friendly Markdown")
    p.add_argument("input", help="Path, URL, or '-' for stdin")
    p.add_argument("-o", "--output", type=Path, default=None, help="Output .md file (default: stdout)")

    # Front matter
    p.add_argument("--no-front-matter", action="store_true", help="Do not emit YAML front matter")
    p.add_argument("--layout", default="post", help="Front matter layout (default: post)")
    p.add_argument("--title", default=None, help="Front matter title override")
    p.add_argument("--date", default=None, help="Front matter date override")
    p.add_argument("--date-from-meta", action="store_true", help="Try to extract date from meta/time tags")
    p.add_argument("--categories", default=None, help="Comma-separated categories")
    p.add_argument("--tags", default=None, help="Comma-separated tags")
    p.add_argument("--permalink", default=None, help="Front matter permalink")

    # Conversion knobs
    p.add_argument("--mode", choices=["main", "body"], default="main", help="Content selection mode")
    p.add_argument("--selector", default=None, help="CSS selector to choose content root (overrides --mode)")
    p.add_argument("--base-url", default=None, help="Base URL for resolving relative links/images")
    p.add_argument("--keep-images", action="store_true", help="Include <img> as Markdown images")
    p.add_argument("--heading-offset", type=int, default=0, help="Add to heading levels (e.g., 1 makes h1→h2)")
    p.add_argument("--max-heading-level", type=int, default=6, help="Clamp max heading depth (default: 6)")

    # Profiles
    p.add_argument(
        "--profile",
        choices=["auto", "generic", "hardinfo2"],
        default="auto",
        help="auto detects HardInfo2; otherwise force generic/hardinfo2",
    )
    p.add_argument(
        "--hardinfo-section-level",
        type=int,
        default=2,
        help="Heading level for HardInfo2 section titles (default: 2)",
    )

    args = p.parse_args(argv)

    html, inferred_base = _read_input(args.input)
    soup = _make_soup(html)

    # Base URL: CLI > <base href> > inferred (URL input)
    base_url = args.base_url or _extract_base_href(soup) or inferred_base

    fm = None
    if not args.no_front_matter:
        title = _extract_title(soup, args.title)
        date = args.date
        if not date and args.date_from_meta:
            date = _extract_date_hint(soup)
        if not date:
            date = _now_with_tz()
        fm = FrontMatter(
            layout=args.layout,
            title=title,
            date=date,
            categories=[c.strip() for c in (args.categories or "").split(",") if c.strip()] or None,
            tags=[t.strip() for t in (args.tags or "").split(",") if t.strip()] or None,
            permalink=args.permalink,
        )

    md_body = convert_html_to_markdown(
        html,
        base_url=base_url,
        mode=args.mode,
        selector=args.selector,
        keep_images=args.keep_images,
        heading_offset=args.heading_offset,
        max_heading_level=args.max_heading_level,
        profile=args.profile,
        hardinfo_section_level=args.hardinfo_section_level,
    )

    out_text = (fm.render() if fm else "") + md_body
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(out_text, encoding="utf-8")
    else:
        sys.stdout.write(out_text)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
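
The converter is also importable. A minimal programmatic sketch, assuming html_to_jekyll_md.py sits in the current directory (convert_html_to_markdown and its keyword arguments are exactly as defined above):

python3 - <<'EOF'
from html_to_jekyll_md import convert_html_to_markdown

html = "<main><h1>Hello</h1><p>A <b>bold</b> claim.</p></main>"
print(convert_html_to_markdown(html, mode="main", profile="generic"))
EOF

This prints a "# Hello" heading followed by the paragraph with **bold** emphasis.
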
URL: https://ib.bsb.br/html2md