# scrape_sesamstraat_episodes.py (raw)


# Tested with: python 3.12.3

import xml.etree.ElementTree as ET
from pathlib import Path
import json
import argparse
import urllib.request

def http_get(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response is opened with ``urllib.request.urlopen`` and closed when
    the ``with`` block exits. Errors from opening, reading, or decoding
    propagate to the caller.
    """
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()
        text = payload.decode("utf-8")
    return text


def get_next_data(html):
    """Extract and parse the ``__NEXT_DATA__`` JSON payload from a page.

    *html* is parsed with ``xml.etree.ElementTree``, so it has to be
    well-formed XML; otherwise ``ET.ParseError`` propagates.

    Returns:
        The decoded JSON text of the first ``<script>`` element whose
        ``id`` attribute equals ``__NEXT_DATA__``.

    Raises:
        AssertionError: if the document contains no such element.
    """
    root = ET.fromstring(html)
    for script in root.iter("script"):
        if script.attrib.get("id") == "__NEXT_DATA__":
            return json.loads(script.text)

    # Raised explicitly instead of via ``assert False`` so the failure is
    # not stripped (silently returning None) when Python runs with -O.
    raise AssertionError('no <script id="__NEXT_DATA__"> element found')


def get_queries(data):
    """Return ``data["props"]["pageProps"]["dehydratedState"]["queries"]``.

    A missing key along the path raises ``KeyError``.
    """
    node = data
    for key in ("props", "pageProps", "dehydratedState", "queries"):
        node = node[key]
    return node


def get_seasons(series_slug):
    """Return the season data embedded in a series page on npo.nl.

    Downloads ``https://npo.nl/start/serie/<series_slug>``, reads its
    ``__NEXT_DATA__`` payload and returns ``["state"]["data"]`` of the
    first query whose ``queryKey`` contains
    ``series:seasons-<series_slug>-include-premium``.

    Args:
        series_slug: slug of the series as used in the page URL.

    Returns:
        The matching query's data. NOTE(review): callers in this file
        iterate it and read ``"slug"`` and ``"guid"`` from each item, so
        it is presumably a list of season mappings; confirm against the
        live payload.

    Raises:
        AssertionError: if no query with that key is present.
    """
    url = f"https://npo.nl/start/serie/{series_slug}"
    queries = get_queries(get_next_data(http_get(url)))
    seasons_query_key = f"series:seasons-{series_slug}-include-premium"
    for query in queries:
        # A query without a queryKey can never match; treat it as empty
        # rather than failing with TypeError on ``in None``.
        if seasons_query_key in (query.get("queryKey") or ()):
            return query["state"]["data"]

    # Raised explicitly instead of via ``assert False`` so the failure is
    # not stripped (silently returning None) when Python runs with -O.
    raise AssertionError(f"no query matching {seasons_query_key!r} in {url}")


def get_episodes(series_slug, season_data):
    """Return the episode data embedded in a season's episode-list page.

    Downloads
    ``https://npo.nl/start/serie/<series_slug>/afleveringen/<season slug>``,
    reads its ``__NEXT_DATA__`` payload and returns ``["state"]["data"]``
    of the first query whose ``queryKey`` contains
    ``programs:season-<season guid>-include-premium-timeless_series``.

    Side effect: the page's full query list is written as indented JSON to
    ``./season_afleveringen.json`` in the current working directory,
    overwriting the file on every call.

    Args:
        series_slug: slug of the series as used in the page URL.
        season_data: mapping providing the season's ``"slug"`` and
            ``"guid"``.

    Returns:
        The matching query's data. NOTE(review): ``main`` iterates it and
        reads ``"slug"`` from each item, so it is presumably a list of
        episode mappings; confirm against the live payload.

    Raises:
        AssertionError: if no query with that key is present.
    """
    season_slug = season_data["slug"]
    url = f"https://npo.nl/start/serie/{series_slug}/afleveringen/{season_slug}"
    queries = get_queries(get_next_data(http_get(url)))
    # Dump of the raw queries for inspection; kept because callers or
    # tooling may rely on the file being produced.
    Path("./season_afleveringen.json").write_text(json.dumps(queries, indent=2))
    season_guid = season_data["guid"]
    episode_query_key = (
        f"programs:season-{season_guid}-include-premium-timeless_series"
    )
    for query in queries:
        # A query without a queryKey can never match; skip it instead of
        # aborting the search with KeyError.
        if episode_query_key in (query.get("queryKey") or ()):
            return query["state"]["data"]

    # Raised explicitly instead of via ``assert False`` so the failure is
    # not stripped (silently returning None) when Python runs with -O.
    raise AssertionError(f"no query matching {episode_query_key!r} in {url}")


def main(series_slug="sesamstraat"):
    """Print the play URL of every episode of a series as a JSON array.

    For each season returned by ``get_seasons`` and each episode returned
    by ``get_episodes``, builds
    ``https://npo.nl/start/afspelen/<episode slug>`` and prints the
    resulting list to stdout as indented JSON.

    Args:
        series_slug: slug of the series to scrape; defaults to
            ``"sesamstraat"``.
    """
    episode_urls = [
        f"https://npo.nl/start/afspelen/{episode['slug']}"
        for season in get_seasons(series_slug)
        for episode in get_episodes(series_slug, season)
    ]
    print(json.dumps(episode_urls, indent=2))


# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()