wordpress_export_extract_md.py (raw)


#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "pandoc",
# ]
# ///

from pathlib import Path
from string import Template
import pandoc
from collections import defaultdict
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime


def get_child_of(elem, tag):
    """Return the first direct child of *elem* whose tag contains *tag*.

    The comparison is a substring test (``tag in child.tag``), not equality,
    so ``"post_date"`` matches any child tag that merely contains that text.
    ElementTree spells namespaced tags as ``{uri}local``; the substring test
    matches those regardless of namespace (presumably why it is used).

    Children are examined in iteration order.  ``None`` is returned when no
    child matches.
    """
    candidates = (node for node in elem if tag in node.tag)
    return next(candidates, None)


def text_orelse(elem, other=None):
    """Return ``elem.text``, falling back to *other* when there is no text.

    *other* is returned when *elem* itself is ``None`` (for example, a child
    lookup that found nothing) or when ``elem.text`` is ``None``.  Any other
    text, including the empty string, is returned unchanged.
    """
    # Identity comparison with None on purpose: a truthiness test would
    # also replace empty-string text, which must be passed through as-is.
    if elem is None or elem.text is None:
        return other
    return elem.text


def standalone_readable_date(date):
    """Format *date* as ``Weekday, Month DD, YYYY``.

    Example: ``Wednesday, December 04, 2024``.  *date* only needs a
    ``strftime`` method; the day of the month is zero-padded.
    """
    long_form = "%A, %B %d, %Y"
    return date.strftime(long_form)

##################
### Output dir ###
##################

# Every run starts from an empty output directory.  ``shutil.rmtree`` raises
# FileNotFoundError when the directory is missing (e.g. on the very first
# run), so it is only removed when it is actually there.
out = Path("out")
if out.exists():
    shutil.rmtree(out)
out.mkdir(exist_ok=True)
num_dropped = 0  # not referenced in the visible part of the script

tags = []  # every element tag seen directly under the channel
slugs = []  # every slug handed out so far, used to keep slugs unique
toc = defaultdict(list)  # post type -> list of entries for the index page
post_dates = {}  # not referenced in the visible part of the script

#####################################################
### Fill in the path to your own export file here ###
#####################################################

# Location of the WordPress XML export to convert; a relative path is
# resolved against the current working directory.
export_file = "wordpressContentExport/knightsofthecompiler.wordpress.2024-12-04.000.xml"
tree = ET.parse(export_file)
root = tree.getroot()

# The items to convert are read from the children of the root's first child
# (presumably the <channel> element of the export -- confirm for your file).
channel = root[0]
# Punctuation that is removed outright when a title is turned into a slug.
_SLUG_DROP_CHARS = str.maketrans("", "", ":'()!,.#%")

# Walk every direct child of the channel.  All tags are recorded (for the
# summary printed afterwards); only "item" elements are converted into a
# Markdown file at out/<post_type>/<slug>/index.md and added to the table of
# contents.
for child in channel:
    tags.append(child.tag)
    if child.tag != "item":
        continue

    item = child
    date = text_orelse(get_child_of(item, "post_date"))
    date = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
    # A default is supplied, so the title is always a string.
    title = text_orelse(get_child_of(item, "title"), "Untitled")
    content = text_orelse(get_child_of(item, "content"))
    # Read but not used in the visible part of the script.
    attachment_url = text_orelse(get_child_of(item, "attachment_url"))
    post_id = get_child_of(item, "post_id").text
    post_type = get_child_of(item, "post_type").text

    slug = (
        title.lower()
        .translate(_SLUG_DROP_CHARS)
        # NOTE(review): this also strips "the" inside longer words
        # ("other" -> "or"); left unchanged so generated slugs stay stable.
        .replace("the", "")
        .replace("  ", " ")
        .strip()
        .replace(" ", "-")
    )
    # A slug must be a single path component: a "/" would create nested
    # directories, and a leading "/" would make the joined path absolute so
    # that the file lands outside `out`.
    slug = slug.replace("/", "-").replace("\\", "-")
    # An empty slug would write straight into the post-type directory.
    if not slug:
        slug = "untitled"

    # Make the slug unique by appending the first free "-<n>" suffix.
    if slug in slugs:
        suffix = 1
        while f"{slug}-{suffix}" in slugs:
            suffix += 1
        slug = f"{slug}-{suffix}"
    slugs.append(slug)
    print(f"{post_id}: {slug} ({date})")

    if content is None:
        content = "???? no content ????"
    else:
        # Convert the item's HTML body to Markdown via pandoc.
        content = pandoc.write(
            pandoc.read(content, options=["--from", "html"]),
            options=["--to", "markdown"],
        )

    item_path = Path(post_type) / slug
    item_dir = out / item_path
    item_dir.mkdir(exist_ok=True, parents=True)

    # Basic template used for each page, post, etc.
    # Written as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    (item_dir / "index.md").write_text(
        f"""
---
title: "{title}"
template: template/page.mako
kind: kotc
---

*Date published: {standalone_readable_date(date)}*

{content}
""",
        encoding="utf-8",
    )

    toc[post_type].append(
        {"path": item_path, "date": date, "id": post_id, "title": title}
    )

# Summary: how many slugs were generated and how many of them are distinct
# (the two numbers differ only if a duplicate slug slipped through).
total_slugs = len(slugs)
distinct_slugs = len(set(slugs))
print(f"---\nNum slugs: {total_slugs}")
print(f"Num unique slugs: {distinct_slugs}")

# List each distinct element tag once; set iteration order is arbitrary.
print("---\nTags:")
for seen_tag in set(tags):
    print(f"- {seen_tag}")


def make_post_entry(item):
    """Render one table-of-contents entry as two lines of Markdown.

    *item* is a mapping with ``"date"``, ``"title"`` and ``"path"`` keys.
    The first line is the italicised readable date, the second a link whose
    text is the title and whose target is the path.
    """
    published = standalone_readable_date(item["date"])
    link = f"[{item['title']}]({item['path']})"
    # The two spaces before the newline are a Markdown hard line break.
    return f"*{published}*  \n{link}"


# Pages are listed alphabetically by title.
toc["page"].sort(key=lambda page: page["title"])
# Posts are listed newest first: ascending sort followed by a full reversal
# (note this orders equal dates differently from ``sort(reverse=True)``).
toc["post"].sort(key=lambda post: post["date"])
toc["post"].reverse()

pages = "\n".join(f"- [{page['title']}]({page['path']})" for page in toc["page"])
posts = "\n\n".join(make_post_entry(post) for post in toc["post"])

# Template for main index file of export.
index_md = f"""
---
title: "Archived Blog: Knights of the Compiler"
template: template/page.mako
kind: kotc
---

Here's my previous blog! TODO

# Pages

{pages}

# Posts

{posts}
        
"""

# Written as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (titles may contain non-ASCII characters).
(out / "index.md").write_text(index_md, encoding="utf-8")