from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
# Entry-point URL of the scrape: it is passed to get_industry_urls() to
# discover the individual industry pages.
industry_page = "http://biz.yahoo.com/ic/ind_index.html"
def get_industry_urls(industry_page):
    """Return the link targets found in the industry table of a page.

    Downloads ``industry_page``, parses it with BeautifulSoup and collects
    the ``href`` of each ``<a>`` inside the eighth ``<table>`` (index 7) of
    the document.

    Args:
        industry_page: URL of the page to download.

    Returns:
        A list of ``href`` strings, in the order BeautifulSoup yields the
        anchors.  Anchors with no ``href`` (or an empty one) are skipped,
        since they cannot be followed.

    Raises:
        IndexError: if the page contains fewer than eight tables.
        Any error raised by ``urlopen`` is propagated unchanged.
    """
    response = urlopen(industry_page)
    try:
        # BeautifulSoup reads the whole response while building the tree, so
        # the connection can be released as soon as parsing is done.
        soup = BeautifulSoup(response)
    finally:
        response.close()
    # The table position is tied to the page layout; a redesign of the page
    # will surface here as an IndexError or as unrelated links.
    links = soup.fetch("table")[7].fetch("a")
    return [a['href'] for a in links if a.get('href')]
# NOTE(review): the call is inside quotes, so this binds the literal text
# "get_industry_urls(industry_page)" rather than the function's result.
# The name is rebound by the `for` loop further down before it is ever read
# in the visible code, so this looks like a leftover -- confirm and remove.
industry_url = "get_industry_urls(industry_page)"
def get_company_index(industry_url):
    """Return the ``href`` of the company-index link on an industry page.

    Downloads ``industry_url``, parses it with BeautifulSoup and picks the
    third ``<a>`` (index 2) inside the twelfth ``<table>`` (index 11) of the
    document.

    Args:
        industry_url: URL of the industry page to download.

    Returns:
        The ``href`` attribute of the selected anchor.

    Raises:
        IndexError: if the page has fewer than twelve tables, or the chosen
            table has fewer than three anchors.
        KeyError: if the selected anchor has no ``href`` attribute.
        Any error raised by ``urlopen`` is propagated unchanged.
    """
    response = urlopen(industry_url)
    try:
        # BeautifulSoup reads the whole response while building the tree, so
        # the connection can be released as soon as parsing is done.
        soup = BeautifulSoup(response)
    finally:
        response.close()
    # Both positions are tied to the page layout; pages that are laid out
    # differently will fail here with IndexError.
    index_link = soup.fetch("table")[11].fetch("a")[2]
    return index_link['href']
# Visit every industry page and print the link to its company index.
for industry_url in get_industry_urls(industry_page):
    # Fetch once and reuse the result: every get_company_index() call
    # downloads and parses the industry page, so calling it a second time
    # just to print doubled the number of requests.
    company_index = get_company_index(industry_url)
    # Parenthesised single argument: prints the same text as the Python 2
    # print statement and is also valid Python 3.
    print(company_index)
wait... is that better?
This is absolutely amazing: it's listing out every single company index page. It's taking a while, but I can't complain. Whoa, wait, it just turned red on me and I got a bunch of errors.
*tear* It was going so nicely...
Hm, that's strange... It's listing out all the company index links, but the first link it lists is:
I don't know why that's happening.