View Single Post
Old Jun 6th, 2006, 3:54 PM   #233
zem52887
Hobbyist Programmer
 
Join Date: May 2006
Posts: 127
Rep Power: 3 zem52887 is on a distinguished road
For the record, here it is:
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from time import sleep
import re
import sys
import shelve

# Starting point of the crawl: the page handed to get_industry_urls() below.
industry_page = "http://biz.yahoo.com/ic/ind_index.html"
# Persistent on-disk mapping (file "cache.dat") that cached_urlopen() uses to
# store each downloaded page body under its URL.
cache = shelve.open("cache.dat")

def cached_urlopen(url):
    """Return the body of ``url``, downloading it only on a cache miss.

    The module-level ``cache`` mapping is keyed by URL. When ``url`` is not
    yet present, the page is fetched with ``urlopen`` and its full body is
    stored before being returned; otherwise the stored body is returned
    without any network access.
    """
    # ``in`` replaces the deprecated ``has_key`` (removed in Python 3) and is
    # supported by shelves as well as plain dicts.
    if url not in cache:
        cache[url] = urlopen(url).read()
    return cache[url]

def get_industry_urls(industry_page):
    """Return the href of every industry link on the industry index page.

    The links are read from the eighth ``<table>`` (index 7) of the page at
    ``industry_page``; the anchor whose text is "Alphabetical" is left out.
    """
    page = BeautifulSoup(cached_urlopen(industry_page))
    industry_table = page.fetch("table")[7]

    hrefs = []
    for anchor in industry_table.fetch("a"):
        if anchor.string != "Alphabetical":
            hrefs.append(anchor['href'])
    return hrefs

def get_company_index(industry_url):
    """Return the href of the company-index link on an industry page.

    The link is taken positionally: it is the third anchor (index 2) inside
    the twelfth ``<table>`` (index 11) of the page at ``industry_url``.
    """
    page = BeautifulSoup(cached_urlopen(industry_url))
    anchors = page.fetch("table")[11].fetch("a")
    return anchors[2]['href']
                
def get_company_urls(company_index):
    """Return the hrefs of the company links listed on a company-index page.

    Anchors are read from the twenty-second ``<table>`` (index 21) of the page
    at ``company_index``. Links whose href contains "q?s", and the anchors
    labelled "Public" or "Private / Foreign", are skipped.
    """
    page = BeautifulSoup(cached_urlopen(company_index))
    listing = page.fetch("table")[21]

    company_links = []
    for anchor in listing.fetch("a"):
        href = anchor['href']
        if "q?s" in href:
            continue
        label = anchor.string
        if label != "Public" and label != "Private / Foreign":
            company_links.append(href)
    return company_links

def get_company_data(company_urls):
    """Render one company's profile page as a one-row HTML table.

    ``company_urls`` is, despite the plural name, a single URL: it is passed
    straight to ``cached_urlopen`` and parsed as one page.

    The returned string is ``<table border = 1>`` containing one ``<tr>`` with
    six ``<td>`` cells: company name (bold), company-profile table,
    contact-information table, financial-highlights table (or "N/A"),
    key-people table, and a public/private marker ("Pub" or "<b>Priv</b>").

    The name, profile, contact and key-people lookups are not guarded; if the
    page lacks the text they search for, the resulting error propagates to
    the caller.
    """
    soup = BeautifulSoup(cached_urlopen(company_urls))

    # Company name: the first text containing "Company Profile", with the
    # fixed " - Yahoo! Finance" title suffix stripped.
    name = soup.firstText(re.compile("Company Profile")).replace(
        "Company Profile - Yahoo! Finance", "")

    # Company profile: the table following the third "Company Profile" text.
    profile_heading = soup.fetchText(re.compile("Company Profile"))[2]
    company_profile = profile_heading.findNext("table")

    # Contact information: the table enclosing the heading text.
    contact_heading = soup.firstText(re.compile("Contact Information"))
    contact_table = contact_heading.findParent("table")

    # Financial highlights: optional. The heading is tested *before* it is
    # dereferenced so a page without this section yields "N/A" instead of
    # failing on ``findParent``. A miss is falsy whether the parser reports it
    # as None or as an empty placeholder object (varies by BeautifulSoup
    # version).
    highlights_heading = soup.firstText(re.compile("Highlights"))
    if highlights_heading:
        highlights_cell = highlights_heading.findParent("table")
    else:
        highlights_cell = "N/A"

    # Key people: the table enclosing the heading text.
    people_heading = soup.firstText(re.compile("Key People"))
    key_people = people_heading.findParent("table")

    # Public/private: a page with any "Chart" text is treated as public.
    if soup.fetchText(re.compile("Chart")):
        status = "Pub"
    else:
        status = "<b>Priv</b>"

    # (width, content) for each cell, in output order. The last width was
    # previously written with a broken escape that emitted `width=\.5%"`.
    cells = [
        ("10%", "<b>" + str(name) + "</b>"),
        ("35%", str(company_profile)),
        ("19.75%", str(contact_table)),
        ("19.75%", str(highlights_cell)),
        ("15%", str(key_people)),
        (".5%", str(status)),
    ]
    row = "".join(["<td width=\"" + width + "\">" + content + "</td>"
                   for width, content in cells])
    return "<table border = 1>" + "<tr>\n" + row + "</tr>" + "</table>"
        
# Driver: walk industry index -> each industry's company index -> each company
# profile, writing one rendered table per company to data.html and echoing it
# to stdout.
#
# NOTE: each company is emitted as its own <table> directly inside the outer
# <table> wrapper, with no enclosing <tr>/<td>. That output structure is kept
# as-is here.
output_file = open("data.html", "w")  # not named `file`: that shadows the builtin
try:
    output_file.write("<table>\n")

    for industry_url in get_industry_urls(industry_page):
        company_index = get_company_index(industry_url)

        for company_url in get_company_urls(company_index):
            # Render once and reuse the result for both the file and stdout.
            company_html = get_company_data(company_url)
            output_file.write(company_html)
            print(company_html)

    output_file.write("</table>\n")
finally:
    # Release both handles even if a fetch or parse fails part-way through;
    # closing the shelf ensures cached pages are flushed to disk.
    output_file.close()
    cache.close()

Happy parsing!
zem52887 is offline   Reply With Quote