For the record, here it is:
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from time import sleep
import re
import sys
import shelve
# Page listing all industries; the crawl at the bottom of the script starts here.
industry_page = "http://biz.yahoo.com/ic/ind_index.html"
# On-disk shelf used by cached_urlopen to map each URL to its downloaded body.
cache = shelve.open("cache.dat")
def cached_urlopen(url):
    """Return the body of ``url``, downloading it only if it is not cached.

    The module-level ``cache`` shelf is consulted first. On a miss the page
    is fetched with ``urlopen``, stored in the shelf under ``url`` and
    returned.

    Any exception raised by ``urlopen`` or ``read`` propagates to the
    caller; nothing is stored in the cache in that case.
    """
    # ``in`` replaces the deprecated ``has_key`` and works on any mapping.
    if url in cache:
        return cache[url]

    response = urlopen(url)
    try:
        body = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    cache[url] = body
    # Return the local copy rather than reading it back from the shelf.
    return body
def get_industry_urls(industry_page):
    """Collect the industry links from the industry index page.

    The page at ``industry_page`` is fetched through ``cached_urlopen`` and
    parsed. The result is the list of ``href`` values of the anchors inside
    the table at index 7 of the document, leaving out every anchor whose
    text is "Alphabetical".
    """
    document = BeautifulSoup(cached_urlopen(industry_page))
    # The page is addressed positionally: table index 7 holds the links.
    industry_table = document.fetch("table")[7]
    hrefs = []
    for anchor in industry_table.fetch("a"):
        if anchor.string != "Alphabetical":
            hrefs.append(anchor['href'])
    return hrefs
def get_company_index(industry_url):
    """Return the ``href`` of the company index link on an industry page.

    The page at ``industry_url`` is fetched through ``cached_urlopen``; the
    link is picked positionally as the anchor at index 2 inside the table
    at index 11.
    """
    page = BeautifulSoup(cached_urlopen(industry_url))
    tables = page.fetch("table")
    anchors = tables[11].fetch("a")
    return anchors[2]['href']
def get_company_urls(company_index):
    """List the company page links found on a company index page.

    The page at ``company_index`` is fetched through ``cached_urlopen``.
    Anchors are taken from the table at index 21. An anchor is skipped when
    its ``href`` contains "q?s" or when its text is "Public" or
    "Private / Foreign"; the ``href`` of every other anchor is returned.
    """
    listing = BeautifulSoup(cached_urlopen(company_index))
    anchors = listing.fetch("table")[21].fetch("a")
    company_links = []
    for anchor in anchors:
        href = anchor['href']
        if "q?s" in href:
            continue
        label = anchor.string
        if label != "Public" and label != "Private / Foreign":
            company_links.append(href)
    return company_links
def get_company_data(company_urls):
    """Scrape one company page and return it as a one-row HTML table.

    ``company_urls`` is passed straight to ``cached_urlopen``, i.e. it is
    the URL of a single page (the plural name is kept so existing callers
    keep working).

    The returned string is ``<table border = 1>`` containing one row with
    six cells, in this order:

    1. the company name in bold,
    2. the company profile table,
    3. the contact information table,
    4. the financial highlights table, or "N/A" when the page has no text
       matching "Highlights",
    5. the key people table,
    6. "Pub" when the page contains text matching "Chart", otherwise
       "<b>Priv</b>".

    Only the highlights section is treated as optional. An ``IndexError``
    is raised when the page has fewer than three text nodes matching
    "Company Profile"; a missing name, contact or key-people section is
    not handled here either.
    """
    soup = BeautifulSoup(cached_urlopen(company_urls))

    # Company name: first text matching "Company Profile", with the fixed
    # title suffix removed.
    name = soup.firstText(re.compile("Company Profile")).replace(
        "Company Profile - Yahoo! Finance", "")

    # Company profile: the table that follows the third matching text node.
    profile = soup.fetchText(re.compile("Company Profile"))[2]
    companyprofile = profile.findNext("table")

    # Contact information: the table enclosing the heading text.
    contact = soup.firstText(re.compile("Contact Information"))
    contacttable = contact.findParent("table")

    # Financial highlights are optional. Test the lookup result *before*
    # dereferencing it; a plain truth test covers both an empty/null result
    # and None, where the previous len() check came too late to help.
    highlights = soup.firstText(re.compile("Highlights"))
    if highlights:
        fhighlights = highlights.findParent("table")
    else:
        fhighlights = "N/A"

    # Key people: the table enclosing the heading text.
    key = soup.firstText(re.compile("Key People"))
    keypeople = key.findParent("table")

    # Public/private marker: any text matching "Chart" means public.
    if soup.fetchText(re.compile("Chart")):
        listing = "Pub"
    else:
        listing = "<b>Priv</b>"

    # (width, content) per cell. The last width used to be emitted as
    # width=\.5%" (stray backslash, missing opening quote); it is now a
    # properly quoted attribute like the others.
    cells = [
        ("10%", "<b>" + str(name) + "</b>"),
        ("35%", str(companyprofile)),
        ("19.75%", str(contacttable)),
        ("19.75%", str(fhighlights)),
        ("15%", str(keypeople)),
        (".5%", str(listing)),
    ]

    pieces = ["<table border = 1>", "<tr>\n"]
    for width, content in cells:
        pieces.append("<td width=\"" + width + "\">" + content + "</td>")
    pieces.append("</tr>")
    pieces.append("</table>")
    return "".join(pieces)
# Crawl: industry index -> industry pages -> company index -> company pages.
# Every company's HTML fragment is written to data.html and echoed to stdout.
output_file = open("data.html", "w")  # renamed from `file` to stop shadowing the builtin
try:
    output_file.write("<table>\n")  # \n means add a newline
    for industry_url in get_industry_urls(industry_page):
        company_index = get_company_index(industry_url)
        for company_urls in get_company_urls(company_index):
            # Scrape each company once and reuse the result for both outputs.
            company_html = get_company_data(company_urls)
            output_file.write(company_html)
            # Single-argument print(...) prints the same text as the
            # Python 2 print statement.
            print(company_html)
    output_file.write("</table>\n")
finally:
    # Close the output even when a page fails to download or parse, and
    # close the shelf so the cached pages are flushed to disk.
    output_file.close()
    cache.close()
Happy parsing!