# scrape_sesamstraat_episodes.py (raw)
# Tested with: python 3.12.3
import xml.etree.ElementTree as ET
from pathlib import Path
import json
import argparse
import urllib.request
def http_get(url, timeout=30.0):
    """Fetch *url* and return the response body decoded to text.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        timeout: Seconds to wait on blocking network operations, so that a
            stalled connection cannot hang the scraper indefinitely.

    Returns:
        The response body as ``str``. It is decoded with the charset
        declared in the ``Content-Type`` header, falling back to UTF-8
        when no charset is declared or the declared one is unknown to
        Python.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # Honour the server-declared charset instead of assuming UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        body = response.read()
    try:
        return body.decode(charset)
    except LookupError:
        # The header named a codec Python does not know; use the default.
        return body.decode("utf-8")
def get_next_data(html):
    """Extract the JSON payload of the ``__NEXT_DATA__`` script element.

    Args:
        html: Page markup as text. It is parsed with the XML parser, so it
            has to be well-formed XML.

    Returns:
        The decoded JSON content of the first ``<script>`` element whose
        ``id`` attribute equals ``__NEXT_DATA__``.

    Raises:
        xml.etree.ElementTree.ParseError: If *html* is not well-formed.
        AssertionError: If no such script element is present.
    """
    root = ET.fromstring(html)
    for script in root.iter("script"):
        if script.attrib.get("id") == "__NEXT_DATA__":
            return json.loads(script.text)
    # Raised explicitly: ``assert False`` is removed under ``python -O`` and
    # the function would then silently return None.
    raise AssertionError('no <script id="__NEXT_DATA__"> element found in page')
def get_queries(data):
    """Return ``data["props"]["pageProps"]["dehydratedState"]["queries"]``.

    Walks the nested mapping one key at a time; a missing key raises
    ``KeyError`` exactly as direct subscripting would.
    """
    node = data
    for key in ("props", "pageProps", "dehydratedState", "queries"):
        node = node[key]
    return node
def get_seasons(series_slug):
    """Return the seasons data embedded in a series page on npo.nl.

    Downloads ``https://npo.nl/start/serie/<series_slug>``, reads its
    ``__NEXT_DATA__`` payload and looks through the dehydrated queries.

    Args:
        series_slug: Series identifier as used in the npo.nl URL.

    Returns:
        ``query["state"]["data"]`` of the first query whose ``queryKey``
        contains ``series:seasons-<series_slug>-include-premium``.

    Raises:
        AssertionError: If no query carries that key.
    """
    url = f"https://npo.nl/start/serie/{series_slug}"
    html = http_get(url)
    data = get_next_data(html)
    queries = get_queries(data)
    wanted_key = f"series:seasons-{series_slug}-include-premium"
    for query in queries:
        # ``or ()`` lets a query without a queryKey be skipped instead of
        # failing with "argument of type 'NoneType' is not iterable".
        if wanted_key in (query.get("queryKey") or ()):
            return query["state"]["data"]
    # Raised explicitly: ``assert False`` is removed under ``python -O`` and
    # the function would then silently return None.
    raise AssertionError(f"no query with key {wanted_key!r} found at {url}")
def get_episodes(series_slug, season_data):
    """Return the episodes data embedded in a season's episode page.

    Downloads
    ``https://npo.nl/start/serie/<series_slug>/afleveringen/<season slug>``,
    reads its ``__NEXT_DATA__`` payload and looks through the dehydrated
    queries.

    Args:
        series_slug: Series identifier as used in the npo.nl URL.
        season_data: Mapping for one season; its ``"slug"`` and ``"guid"``
            entries are read.

    Returns:
        ``query["state"]["data"]`` of the first query whose ``queryKey``
        contains
        ``programs:season-<guid>-include-premium-timeless_series``.

    Raises:
        AssertionError: If no query carries that key.
    """
    # Values are pulled out of the f-strings because reusing the enclosing
    # quote character inside a replacement field only parses on Python 3.12+.
    season_slug = season_data["slug"]
    url = f"https://npo.nl/start/serie/{series_slug}/afleveringen/{season_slug}"
    html = http_get(url)
    data = get_next_data(html)
    queries = get_queries(data)
    # NOTE(review): this looks like a leftover debug dump. It lands in the
    # current working directory and is overwritten on every call; kept so
    # the function's side effects stay unchanged.
    Path("./season_afleveringen.json").write_text(json.dumps(queries, indent=2))
    season_guid = season_data["guid"]
    episode_query_key = (
        f"programs:season-{season_guid}-include-premium-timeless_series"
    )
    for query in queries:
        # ``or ()`` lets a query without a queryKey be skipped rather than
        # aborting the search, matching how get_seasons reads the key.
        if episode_query_key in (query.get("queryKey") or ()):
            return query["state"]["data"]
    # Raised explicitly: ``assert False`` is removed under ``python -O`` and
    # the function would then silently return None.
    raise AssertionError(f"no query with key {episode_query_key!r} found at {url}")
def main(series_slug="sesamstraat"):
    """Print the playback URL of every episode of a series as a JSON array.

    Every season returned by ``get_seasons`` is expanded with
    ``get_episodes``; each episode's ``"slug"`` is turned into a
    ``https://npo.nl/start/afspelen/<slug>`` URL. The URLs are written to
    standard output as one indented JSON list.

    Args:
        series_slug: Series identifier as used in the npo.nl URL. Defaults
            to ``"sesamstraat"``, the value that used to be hard-coded.
    """
    # Single quotes inside the replacement field keep this f-string valid on
    # Python versions older than 3.12.
    episode_urls = [
        f"https://npo.nl/start/afspelen/{episode['slug']}"
        for season in get_seasons(series_slug)
        for episode in get_episodes(series_slug, season)
    ]
    print(json.dumps(episode_urls, indent=2))
# Run the scraper only when this file is executed as a script, so that
# importing it as a module does not trigger any network requests.
if __name__ == "__main__":
    main()