microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent · Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

f46fff4e5103217703b51e27ba3f6405ac000e21

Find a branch or tag

Branches

f46fff4e5103217703b51e27ba3f6405ac000e21

Clone

HTTPS

Download ZIP

TypeAgent/python/nprData

python/nprData/generate_data.py

55 lines · mode: code

Raw Download

Latest commit unavailable.

unknown

1	`# Copyright (c) Microsoft Corporation and Henry Lucco.`
2	`# Licensed under the MIT License.`
3
4	`import requests`
5	`from bs4 import BeautifulSoup`
6	`from structs import Episode`
7	`import json`
8
# Site root; prepended to the archive hrefs found while scraping.
BASE_URL = "https://www.npr.org"
# Archive index page of "All Things Considered" where the scrape starts.
URL = f"{BASE_URL}/programs/all-things-considered/archive"
11
def get_podcast_links(page_url):
    """Collect episode links reachable from an archive index page.

    Fetches ``page_url``, follows every link on it whose href contains
    ``/programs/all-things-considered/archive``, and gathers from each of
    those archive pages the hrefs that contain
    ``/programs/all-things-considered/`` but are neither archive links nor
    the bare program root.

    Args:
        page_url: URL of the archive index page to start from.

    Returns:
        A list of episode hrefs exactly as they appear in the page markup,
        in first-seen order and without duplicates.

    Raises:
        requests.RequestException: If the initial index page cannot be
            fetched. Failures on individual archive pages are reported on
            stdout and that page is skipped.
    """
    from urllib.parse import urljoin

    program_path = '/programs/all-things-considered/'
    archive_path = '/programs/all-things-considered/archive'
    # Without a timeout a stalled connection would block the scrape forever.
    timeout_seconds = 30

    response = requests.get(page_url, timeout=timeout_seconds)
    soup = BeautifulSoup(response.text, 'html.parser')

    # urljoin leaves absolute hrefs untouched and resolves relative ones
    # against the site root; plain concatenation would corrupt the former.
    # dict.fromkeys drops repeated anchors while keeping page order, so no
    # archive page is downloaded twice.
    archive_links = list(dict.fromkeys(
        urljoin(BASE_URL, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if archive_path in anchor['href']
    ))

    # A dict is used as an insertion-ordered set of episode hrefs.
    episode_links = {}
    for i, archive_link in enumerate(archive_links, start=1):
        try:
            response = requests.get(archive_link, timeout=timeout_seconds)
        except requests.RequestException as e:
            # Best effort: one unreachable archive page should not discard
            # the links already gathered from the others.
            print(f"Error fetching archive page {archive_link}: {e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if program_path in href and "archive" not in href and href != program_path:
                episode_links[href] = None

        print(f"Processed archive page {archive_link} [{i}/{len(archive_links)}] with {len(episode_links)} episodes")

    return list(episode_links)
35
36
if __name__ == "__main__":
    # Scrape the archive page
    podcast_links = get_podcast_links(URL)
    total = len(podcast_links)
    print(f"Found {total} podcast episodes to process")

    # For each podcast episode, extract the transcript
    output_episodes = []
    # Count from 1 so the progress display ends at total/total.
    for i, podcast_link in enumerate(podcast_links, start=1):
        print(podcast_link)
        try:
            episode = Episode.from_link(podcast_link)
            output_episodes.append(episode.to_dict())
        except Exception as e:
            # Best-effort scrape: report the failing episode and move on so
            # one bad page does not abort the whole run.
            print(f"Error processing episode {podcast_link}: {e}")
            continue

        print(f"Processed episode {episode.id} [{i}/{total}] with {len(episode.sections)} sections")
        # Rewrite the output after every successful episode so an interrupted
        # run keeps what it has collected. The encoding is explicit so the
        # file does not depend on the platform default.
        with open("npr.json", "w", encoding="utf-8") as f:
            json.dump(output_episodes, f, indent=4)

microsoft/TypeAgent

Branches

Tags

Clone