Okay, so I tried to get the basic framework of how this works (I haven't figured out exactly which table each link is contained in), but how does this look:
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
# URL of the industry index page; this is the starting point of the scrape
# and the value meant to be passed to get_industry_urls().
industry_page = "http://biz.yahoo.com/ic/ind_index.html"
def get_industry_urls(industry_page):
    """Collect the link targets found in one table of the industry index.

    Downloads and parses the page at ``industry_page``, selects the table
    at position 7 (the eighth ``<table>`` in the document) and returns a
    list with the ``href`` attribute of each ``<a>`` tag inside it, in
    document order.
    """
    page = urlopen(industry_page)
    soup = BeautifulSoup(page)
    # The table is picked purely by position, so a change in the page
    # layout will silently select a different table (or fail on indexing).
    industry_table = soup.fetch("table")[7]
    hrefs = []
    for anchor in industry_table.fetch("a"):
        hrefs.append(anchor['href'])
    return hrefs
def get_company_index(industry_url):
    """Return the URL of the company index linked from an industry page.

    Downloads and parses the page at ``industry_url``, picks the fourth
    ``<a>`` tag inside the table at position 10, and returns that anchor's
    ``href`` attribute so the result can be handed straight to ``urlopen``.
    """
    soup = BeautifulSoup(urlopen(industry_url))
    # not sure which table, looks like the 4th link though
    # NOTE(review): both indices are positional guesses - confirm against
    # the real page markup before relying on them.
    index_link = soup.fetch("table")[10].fetch("a")[3]
    # index_link is a single anchor tag; iterating over it would yield its
    # child nodes (the link's contents), not a URL, so return the target.
    return index_link['href']
#def get_company_urls(company_index):
#soup = BeautifulSoup(urlopen(company_index))
#urls = soup.fetch("table")[12].fetch("a") #not sure of this table either
#return[a for a in urls]
#def get_company_data(company_url):
#soup = BeautifulSoup(urlopen(company_url))
#data[0] = soup.fetch("table")[?] #figure out these tables
#data[1] = soup.fetch("table")[?]
#return data
#for industry_url in get_industry_urls(industry_page):
#company_index = get_company_index(industry_url)
#for company_url in get_company_urls(company_index):
#print get_company_data(company_url) #(well output but you know...)
Edit: I'm not sure if this makes sense - does the first for loop create the variable industry_url and store a value in it to be passed to get_company_index?