microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/nprData/generate_data.py
55 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | import requests |
| 5 | from bs4 import BeautifulSoup |
| 6 | from structs import Episode |
| 7 | import json |
| 8 | |
| 9 | URL = 'https://www.npr.org/programs/all-things-considered/archive' |
| 10 | BASE_URL = 'https://www.npr.org' |
| 11 | |
def get_podcast_links(page_url):
    """Collect episode hrefs reachable from an archive landing page.

    Fetches ``page_url`` and gathers every anchor whose href contains
    ``/programs/all-things-considered/archive``. Each of those archive
    pages is then fetched in turn and scanned for anchors that point
    below ``/programs/all-things-considered/`` without mentioning
    ``archive``.

    Args:
        page_url: URL of the archive landing page to start from.

    Returns:
        A list of episode hrefs exactly as written in the page markup,
        in first-seen order and without duplicates.

    Raises:
        requests.RequestException: If the landing page cannot be fetched
            or answers with an HTTP error status. Failures on individual
            archive pages are reported with ``print`` and skipped.
    """
    # Function-scope import so the block does not depend on edits elsewhere.
    from urllib.parse import urljoin

    show_path = '/programs/all-things-considered/'
    archive_path = '/programs/all-things-considered/archive'
    timeout_seconds = 30  # never hang forever on an unresponsive server

    response = requests.get(page_url, timeout=timeout_seconds)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # urljoin leaves absolute hrefs untouched and resolves relative ones,
    # whereas plain concatenation would corrupt an already-absolute URL.
    # dict.fromkeys drops repeated links while keeping first-seen order.
    archive_links = list(dict.fromkeys(
        urljoin(BASE_URL, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if archive_path in anchor['href']
    ))

    # A dict is used as an insertion-ordered set of episode hrefs.
    episode_links = {}
    for position, archive_link in enumerate(archive_links, start=1):
        try:
            response = requests.get(archive_link, timeout=timeout_seconds)
            response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: one unreachable archive page must not discard
            # the links already gathered from the others.
            print(f"Error fetching archive page {archive_link}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if show_path in href and "archive" not in href and href != show_path:
                episode_links.setdefault(href, None)

        print(f"Processed archive page {archive_link} [{position}/{len(archive_links)}] with {len(episode_links)} episodes")

    return list(episode_links)
| 35 | |
| 36 | |
if __name__ == "__main__":
    # Gather every episode link reachable from the archive landing page.
    podcast_links = get_podcast_links(URL)
    total = len(podcast_links)
    print(f"Found {total} podcast episodes to process")

    output_path = "npr.json"

    # Build an Episode from each link and collect its dict form.
    output_episodes = []
    for position, podcast_link in enumerate(podcast_links, start=1):
        print(podcast_link)
        try:
            episode = Episode.from_link(podcast_link)
            output_episodes.append(episode.to_dict())
        except Exception as e:
            # Top-level best-effort boundary: report the failure and move on
            # so one bad episode does not abort the whole run.
            print(f"Error processing episode {podcast_link}: {e}")
            continue

        # 1-based position so the final episode reads [total/total].
        print(f"Processed episode {episode.id} [{position}/{total}] with {len(episode.sections)} sections")

        # Checkpoint after every success so an interrupted run keeps the
        # episodes collected so far.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output_episodes, f, indent=4)

    if not output_episodes:
        # No checkpoint was written above; still leave a valid (empty)
        # JSON document behind for downstream consumers.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output_episodes, f, indent=4)