microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
f46fff4e5103217703b51e27ba3f6405ac000e21

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/nprData/generate_data.py

55lines · modecode

1# Copyright (c) Microsoft Corporation and Henry Lucco.
2# Licensed under the MIT License.
3
4import requests
5from bs4 import BeautifulSoup
6from structs import Episode
7import json
8
# Site root; prepended to root-relative hrefs found while scraping.
BASE_URL = 'https://www.npr.org'
# Entry page of the "All Things Considered" archive. Built from BASE_URL so
# the host is spelled out in exactly one place.
URL = BASE_URL + '/programs/all-things-considered/archive'
11
def _collect_hrefs(page_url):
    """Fetch *page_url* and return the href of every anchor, in page order."""
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]


def get_podcast_links(page_url):
    """Collect episode links reachable from the archive page at *page_url*.

    The page is scanned for links to archive sub-pages; every archive
    sub-page is then fetched and scanned for episode links.

    Args:
        page_url: URL of the top-level archive page.

    Returns:
        A list of episode hrefs exactly as they appear in the page markup,
        without duplicates, in first-seen order.

    Raises:
        requests.RequestException: if a page cannot be fetched.
    """
    from urllib.parse import urljoin

    archive_marker = '/programs/all-things-considered/archive'
    program_root = '/programs/all-things-considered/'

    # urljoin leaves already-absolute hrefs untouched, whereas plain string
    # concatenation would glue BASE_URL onto the front of them.
    # dict.fromkeys drops repeated links while preserving their order, so
    # no archive page is downloaded twice.
    archive_links = list(dict.fromkeys(
        urljoin(BASE_URL, href)
        for href in _collect_hrefs(page_url)
        if archive_marker in href
    ))

    # A dict is used as an insertion-ordered set of episode hrefs.
    episode_links = {}
    total = len(archive_links)
    for position, archive_link in enumerate(archive_links, start=1):
        for href in _collect_hrefs(archive_link):
            # Keep links under the program path, but skip archive pages and
            # the bare program landing page.
            if program_root in href and "archive" not in href and href != program_root:
                episode_links.setdefault(href, None)

        # 1-based position so the final page reports [total/total].
        print(f"Processed archive page {archive_link} [{position}/{total}] with {len(episode_links)} episodes")

    return list(episode_links)
35
36
def main():
    """Scrape every episode linked from the archive and dump them to npr.json.

    Episodes that fail to load or convert are reported and skipped; the
    remaining ones are written as a JSON list.
    """
    # Scrape the archive page
    podcast_links = get_podcast_links(URL)
    total = len(podcast_links)
    print(f"Found {total} podcast episodes to process")

    # For each podcast episode, extract the transcript
    output_episodes = []
    try:
        for position, podcast_link in enumerate(podcast_links, start=1):
            print(podcast_link)
            try:
                episode = Episode.from_link(podcast_link)
                output_episodes.append(episode.to_dict())
            except Exception as e:
                # Best effort: one bad episode must not abort the whole run.
                print(f"Error processing episode {podcast_link}: {e}")
                continue

            # 1-based position so the last episode reports [total/total].
            print(f"Processed episode {episode.id} [{position}/{total}] with {len(episode.sections)} sections")
    finally:
        # Written in `finally` so the episodes gathered so far are saved
        # even when the run is interrupted part-way through.
        with open("npr.json", "w", encoding="utf-8") as f:
            json.dump(output_episodes, f, indent=4)


if __name__ == "__main__":
    main()