View Single Post
Old May 19th, 2006, 2:58 PM   #67
zem52887
Hobbyist Programmer
 
Join Date: May 2006
Posts: 127
Rep Power: 3 zem52887 is on a distinguished road
Okay, so I tried to put together the basic framework of how this works (I haven't figured out exactly which table each link is contained in yet), but how does this look:

from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

industry_page = "http://biz.yahoo.com/ic/ind_index.html"

def get_industry_urls(industry_page):
    """Return the link targets found in the industry index table.

    industry_page -- URL of the page to download and parse.

    The page is fetched with urlopen and parsed with BeautifulSoup; the
    links are taken from the table at index 7 of the document (hard-coded,
    so an IndexError is raised if the page has fewer tables).

    Returns a list with the 'href' value of each anchor in that table.
    Anchors that have no 'href' attribute are skipped instead of raising
    KeyError.
    """
    soup = BeautifulSoup(urlopen(industry_page))
    links = soup.fetch("table")[7].fetch("a")
    # Read the attribute with .get() so an anchor without an href (e.g. a
    # named anchor) yields None rather than aborting the whole scrape.
    hrefs = [a.get('href') for a in links]
    return [href for href in hrefs if href is not None]

def get_company_index(industry_url):
    """Fetch an industry page and return the contents of its index link.

    industry_url -- URL of a single industry page.

    The link is picked by position: the anchor at index 3 inside the table
    at index 10. Both positions are the author's unverified guess (the
    table is uncertain; the link "looks like the 4th").

    Returns a list built by iterating over that anchor.

    NOTE(review): iterating a tag presumably yields its child nodes rather
    than its 'href' -- confirm this is what the caller needs before passing
    the result on to urlopen.
    """
    page = urlopen(industry_url)
    soup = BeautifulSoup(page)
    tables = soup.fetch("table")
    # Positional lookup: table 10, then the 4th anchor inside it.
    anchors = tables[10].fetch("a")
    index_link = anchors[3]
    return list(index_link)

#def get_company_urls(company_index):
        #soup = BeautifulSoup(urlopen(company_index))
        #urls = soup.fetch("table")[12].fetch("a") #not sure of this table either       
        #return[a for a in urls] 

#def get_company_data(company_url)
        #soup = BeautifulSoup(urlopen(company_url))
        #data[0] = soup.fetch("table")[?] #figure out these tables
        #data[1] = soup.fetch("table")[?]
        #return data 
        
#for industry_url in get_industry_urls(industry_page):
    	#company_index = get_company_index(industry_url)

	#for company_url in get_company_urls(company_index):
            #print get_company_data(company_url) #(well output but you know...)

Edit: I'm not sure if this makes sense - does the first for loop create the variable industry_url and store a value in it, which is then passed to get_company_index?

Last edited by zem52887; May 19th, 2006 at 3:13 PM.
zem52887 is offline   Reply With Quote