In this post I will show how to write a simple script to scrape a webpage containing a list of podcast links. I did that while preparing to go to Austin. It is a nice way to use my extra "airport time" to study a little bit.
The first step is to list all the links in the podcast webpage,
import requests
from bs4 import BeautifulSoup, SoupStrainer
def urllister(url):
    """Return the href target of every anchor tag on the page at *url*.

    Parameters:
        url (str): address of the page to scan for links.

    Returns:
        list of str: the ``href`` attribute of each ``<a>`` tag found.
    """
    r = requests.get(url)
    # Restrict parsing to anchor tags; passing an explicit parser avoids
    # bs4's "no parser was explicitly specified" warning (an error in
    # newer releases).
    soup = BeautifulSoup(r.content, "html.parser",
                         parse_only=SoupStrainer("a"))
    # find_all(..., href=True) already skips tags without an href and
    # never yields NavigableStrings, so no try/except is needed.
    return [link["href"] for link in soup.find_all("a", href=True)]
and filter it by the file extension you want to download:
import fnmatch
def filter_url(urls, filetype="*.mp3"):
    """Lazily yield the entries of *urls* matching the *filetype* glob.

    Parameters:
        urls (iterable of str): candidate URLs or file names.
        filetype (str): fnmatch-style glob pattern (default ``"*.mp3"``).

    Returns:
        iterator of str: the matching entries, in their original order.
    """
    # fnmatch.filter already returns the matching subset; wrapping it in
    # a generator expression was redundant.  iter() keeps the original
    # lazy, one-shot return type for callers that rely on it.
    return iter(fnmatch.filter(urls, filetype))
Now we need to create a download function. I do not remember where I got the function below. It is probably a mixture of StackOverflow and some customizations. The beauty of this function is that it can resume a partial download and displays a nice progress bar.
import os
import sys
try:
from urllib.error import HTTPError
from urllib.request import FancyURLopener
except ImportError:
from urllib2 import HTTPError
from urllib import FancyURLopener
from progressbar import ProgressBar
class URLOpener(FancyURLopener):
    """FancyURLopener that treats HTTP 206 (Partial Content) as success.

    When a ``Range`` header is sent to resume a download, the server
    answers 206 instead of 200; the stock opener would treat that as an
    error, so the handler is overridden to do nothing.
    """

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        # 206 is the expected reply to a resumed (Range) request.
        return None
def download(fname, url, verbose=False):
    """Download *url* into the local file *fname*, resuming if partial.

    If *fname* already exists, a ``Range`` header is sent so the server
    only transmits the missing tail, and the new bytes are appended to
    the file.  A progress bar is shown while data is received.

    Parameters:
        fname (str): local file to create or append to.
        url (str): address to fetch.
        verbose (bool): when True, print response headers and byte
            counts to stdout.
    """
    current_size = 0
    url_obj = URLOpener()
    if os.path.exists(fname):
        # Append mode: keep the bytes already on disk.
        output = open(fname, "ab")
        current_size = os.path.getsize(fname)
        # If the file exists, then download only the remainder.
        url_obj.addheader("Range", "bytes=%s-" % (current_size))
    else:
        output = open(fname, "wb")
    web_page = url_obj.open(url)
    if verbose:
        for key, value in web_page.headers.items():
            sys.stdout.write("{} = {}\n".format(key, value))
    # If we already have the whole file, there is no need to download it again.
    num_bytes = 0
    full_size = int(web_page.headers['Content-Length'])
    # NOTE(review): on a resumed (Range) request, Content-Length is the
    # size of the *remaining* bytes, not the whole file, so comparing it
    # against current_size -- and using it as the progress-bar maximum --
    # is only exact for fresh downloads.  Confirm against the server.
    if full_size == current_size:
        msg = "File ({}) was already downloaded from URL ({})".format
        sys.stdout.write(msg(fname, url))
    elif full_size == 0:
        sys.stdout.write("Full file size equal zero!"
                         "Try again later or check the file")
    else:
        if verbose:
            msg = "Downloading {:d} more bytes".format
            sys.stdout.write(msg(full_size - current_size))
        pbar = ProgressBar(maxval=full_size)
        pbar.start()
        while True:
            try:
                # Stream in 8 KiB chunks so large files are not held
                # entirely in memory.
                data = web_page.read(8192)
            except ValueError:
                # presumably raised when the connection is closed early
                # -- treat it as end-of-stream; TODO confirm.
                break
            if not data:
                break
            output.write(data)
            num_bytes = num_bytes + len(data)
            pbar.update(num_bytes)
        pbar.finish()
    web_page.close()
    output.close()
    if verbose:
        msg = "Downloaded {} bytes from {}".format
        sys.stdout.write(msg(num_bytes, web_page.url))
Now find a URL with the podcasts you want and start scraping. Be nice and sleep a little bit before each download!
from time import sleep
# Download episodes 0..100, pausing between requests to be polite to
# the server.
podcasts = range(0, 101)
# Bound the format method once; uri(n) yields the URL for episode n.
uri = ("http://some-url-with-podcasts/podcast-{}.mp3".format)
for podcast in podcasts:
    url = uri(podcast)
    print(url + '\n')
    try:
        # The local file name is the last component of the URL path.
        fname = url.split('/')[-1]
        download(fname, url, verbose=True)
    except HTTPError:
        # Presumably a 404 for a missing episode number -- skip it and
        # keep going.
        print('Cannot download {}\n'.format(url))
    print('\n')
    sleep(2)
Be sure to read the page's terms of use. Some podcast providers do not like scraping!
I will be listening to some Spanish classes. Nope, just lost my phone at
the airport... I won't be listening to anything :-(
HTML(html)