wordpress_export_extract_md.py (raw)
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "pandoc",
# ]
# ///
from pathlib import Path
from string import Template
import pandoc
from collections import defaultdict
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
def get_child_of(elem, tag):
    """Return the first direct child of *elem* whose tag contains *tag*.

    The match is a substring test (``tag in child.tag``), not an equality
    test, so a short name also matches a longer tag that embeds it.
    Children are examined in iteration order and only the first hit is
    returned.

    Args:
        elem: Iterable of child nodes, each exposing a ``tag`` attribute.
        tag: Substring to look for in each child's tag.

    Returns:
        The first matching child, or ``None`` when no child matches.
    """
    matches = (node for node in elem if tag in node.tag)
    return next(matches, None)
def text_orelse(elem, other=None):
    """Return ``elem.text``, or *other* when there is no text to return.

    Args:
        elem: An object with a ``text`` attribute, or ``None``.
        other: Fallback value returned when *elem* is ``None`` or its
            ``text`` is ``None``.

    Returns:
        ``elem.text`` when both *elem* and its text are present (an empty
        string counts as present), otherwise *other*.
    """
    # Identity checks: "is None" cannot be fooled by objects that override
    # __eq__/__ne__, unlike "== None" / "!= None".
    if elem is None or elem.text is None:
        return other
    return elem.text
def standalone_readable_date(date):
    """Format *date* as weekday, month name, zero-padded day and year.

    Example: ``Wednesday, December 04, 2024``.

    Args:
        date: Any object providing ``strftime`` (e.g. ``datetime``).

    Returns:
        The formatted date string.
    """
    long_form = "%A, %B %d, %Y"
    return date.strftime(long_form)
##################
### Output dir ###
##################
out = Path("out")
# Start from a clean output tree. The removal is guarded so that the very
# first run (when "out" does not exist yet) does not fail with
# FileNotFoundError.
if out.exists():
    shutil.rmtree(out)
out.mkdir(exist_ok=True)

num_dropped = 0  # items skipped because post_date or post_type is missing
tags = []  # tag of every direct child of the channel, for the summary
slugs = []  # every slug handed out so far; used to keep slugs unique
toc = defaultdict(list)  # post_type -> list of entry dicts for the index
post_dates = {}  # not populated in this section

# Punctuation deleted from a title when deriving its slug.
slug_punctuation = str.maketrans("", "", ":'()!,.#%")

#####################################################
### Fill in the path to your own export file here ###
#####################################################
tree = ET.parse(
    "wordpressContentExport/knightsofthecompiler.wordpress.2024-12-04.000.xml"
)
root = tree.getroot()
channel = root[0]

for child in channel:
    tags.append(child.tag)
    if child.tag != "item":
        continue
    item = child

    date_text = text_orelse(get_child_of(item, "post_date"))
    post_type = text_orelse(get_child_of(item, "post_type"))
    if date_text is None or post_type is None:
        # Without a date or a type the item can be neither parsed nor
        # filed; count it and move on instead of crashing mid-export.
        num_dropped += 1
        continue
    date = datetime.strptime(date_text, "%Y-%m-%d %H:%M:%S")

    title = text_orelse(get_child_of(item, "title"), "Untitled")
    content = text_orelse(get_child_of(item, "content"))
    attachment_url = text_orelse(get_child_of(item, "attachment_url"))
    post_id = text_orelse(get_child_of(item, "post_id"))

    # Slug: lowercase the title, delete punctuation, drop the article "the"
    # as a whole word only (so "another" is left intact), and join the
    # remaining words with single dashes. split() also collapses runs of
    # whitespace, so no doubled or leading/trailing dashes are produced.
    words = title.lower().translate(slug_punctuation).split()
    slug = "-".join(word for word in words if word != "the")
    if not slug:
        # A title made only of punctuation and/or "the" would otherwise
        # yield an empty directory name.
        slug = "untitled"

    # Keep slugs unique by appending the first free numeric suffix.
    if slug in slugs:
        suffix = 1
        while f"{slug}-{suffix}" in slugs:
            suffix += 1
        slug = f"{slug}-{suffix}"
    slugs.append(slug)

    print(f"{post_id}: {slug} ({date})")

    if content is None:
        content = "???? no content ????"
    else:
        # Convert the item's HTML body to Markdown through pandoc.
        content = pandoc.write(
            pandoc.read(content, options=["--from", "html"]),
            options=["--to", "markdown"],
        )

    item_path = Path(post_type) / slug
    item_dir = out / item_path
    item_dir.mkdir(exist_ok=True, parents=True)

    # Basic template used for each page, post, etc.
    # NOTE(review): title is interpolated unescaped; a title containing a
    # double quote would produce malformed front matter.
    page_md = (
        "\n"
        "---\n"
        f'title: "{title}"\n'
        "template: template/page.mako\n"
        "kind: kotc\n"
        "---\n"
        f"*Date published: {standalone_readable_date(date)}*\n"
        f"{content}\n"
    )
    # Explicit encoding so the output does not depend on the locale.
    (item_dir / "index.md").write_text(page_md, encoding="utf-8")

    toc[post_type].append(
        {"path": item_path, "date": date, "id": post_id, "title": title}
    )

print(f"---\nNum slugs: {len(slugs)}")
print(f"Num unique slugs: {len(set(slugs))}")
if num_dropped:
    print(f"Num dropped: {num_dropped}")
print("---\nTags:")
# Sorted so the summary is identical from run to run.
for tag in sorted(set(tags)):
    print(f"- {tag}")
def make_post_entry(item):
    """Render one post as a two-line Markdown index entry.

    The first line is the post's date in italics; the second is a
    Markdown link whose text is the title and whose target is the path.

    Args:
        item: Mapping with the keys ``"date"``, ``"title"`` and ``"path"``.

    Returns:
        The two lines joined by a single newline.
    """
    published = standalone_readable_date(item["date"])
    link = "[{}]({})".format(item["title"], item["path"])
    return f"*{published}*\n{link}"
# Pages are listed alphabetically by title; posts newest first.
toc["page"] = sorted(toc["page"], key=lambda page: page["title"])
# Sort ascending and then reverse (rather than reverse=True) so that posts
# sharing a timestamp keep the same relative order as before.
toc["post"] = sorted(toc["post"], key=lambda post: post["date"])[::-1]

pages = "\n".join(f"- [{page['title']}]({page['path']})" for page in toc["page"])
posts = "\n\n".join(make_post_entry(post) for post in toc["post"])

# Template for main index file of export.
index_md = f"""
---
title: "Archived Blog: Knights of the Compiler"
template: template/page.mako
kind: kotc
---
Here's my previous blog! TODO
# Pages
{pages}
# Posts
{posts}
"""
# Explicit encoding: titles may contain non-ASCII characters, and the
# default for write_text depends on the locale.
(out / "index.md").write_text(index_md, encoding="utf-8")