Under the influence of my senior Tiantian (天天) I've fallen into the Python pit. After slacking off for more than half a day plus one evening, I finally got a crawler working. Feels like my NOIP is doomed qaq
I learned it from an imooc course. The original plan was to crawl the Baidu Baike entries about OI, but I was naive and the condition for which pages to follow was too simple, so the crawl drifted off topic right from the start...
The resulting page is also pretty ugly; next time I'll make a better one OvO
I picked up a lot of experience from this. The most useful tip: in the main method, write the except branch that catches failed crawls as

    except Exception as f:
        print 'craw failed: ', f

so the printed message tells you roughly where a failure came from.
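Printing the exception only shows its message. When that is not enough to locate the problem, the standard library's traceback module can dump the full stack from inside the same except block. A minimal sketch of the idea (craw_one here is a made-up stand-in for one crawl step, not part of the real crawler):

```python
import traceback

def craw_one(url):
    # made-up stand-in for one crawl step that fails
    raise ValueError('bad page: %s' % url)

try:
    craw_one('http://baike.baidu.com/item/oi/74020')
except Exception as f:
    print 'craw failed: ', f   # message only
    traceback.print_exc()      # full traceback: file, line number, call chain
```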
Some things in Python don't raise any error even when you've typed them wrong.
I still keep making the classic beginner mistake: forgetting self, or typing it wrong.
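A toy example of how that bites (made up for illustration, not from the crawler): the typo in __init__ is accepted silently, and the program only blows up later when the attribute is actually used.

```python
class Page(object):
    def __init__(self, url):
        self.ulr = url      # typo: meant self.url, but Python happily creates self.ulr

    def show(self):
        print self.url      # only fails here: AttributeError: 'Page' object has no attribute 'url'

p = Page('http://baike.baidu.com/item/oi/74020')
p.show()
```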
Here is the crawler code (almost entirely copied from the imooc template 一,一):
Main:

    import url_manager, html_downloader, html_parser, html_outputer

    class SpiderMain(object):
        def __init__(self):
            self.urls = url_manager.UrlManager()
            self.downloader = html_downloader.HtmlDownloader()
            self.parser = html_parser.HtmlParser()
            self.outputer = html_outputer.HtmlOutputer()

        def craw(self, root_url):
            count = 1
            self.urls.add_new_url(root_url)
            while self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    print 'craw %d : %s' % (count, new_url)
                    html_cont = self.downloader.download(new_url)
                    new_urls, new_data = self.parser.parse(new_url, html_cont)
                    self.urls.add_new_urls(new_urls)
                    self.outputer.collect_data(new_data)
                except Exception as f:  # this is what points out roughly why a crawl failed
                    print 'craw failed: ', f

                if count == 100:
                    break

                count = count + 1

            self.outputer.output_html()


    if __name__ == "__main__":
        root_url = "http://baike.baidu.com/item/oi/74020"
        boj_spider = SpiderMain()
        boj_spider.craw(root_url)

Outputer (html_outputer.py):

    class HtmlOutputer(object):
        def __init__(self):
            self.datas = []

        def collect_data(self, data):
            if data is None:
                return
            self.datas.append(data)

        def output_html(self):
            fout = open('output.html', 'w')

            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")

            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
                fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))  # old error: forgot the encode
                fout.write("</tr>")

            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

            fout.close()
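The url_manager module that SpiderMain imports didn't survive the paste. Since the code is said to follow the imooc template almost verbatim, the manager presumably keeps two sets, one for URLs still to crawl and one for URLs already crawled, so no page gets fetched twice. The sketch below is a reconstruction under that assumption, not the exact original file:

```python
# url_manager.py -- sketch of the missing module, imooc-template style
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
```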
Parser (html_parser.py; the _get_new_urls helper that parse() calls got lost in the paste, so a sketch of it is given after the downloader):

    from bs4 import BeautifulSoup

    class HtmlParser(object):

        def _get_new_data(self, page_url, soup):
            res_data = {}
            res_data['url'] = page_url

            # <dd class="lemmaWgt-lemmaTitle-title"><h1> holds the entry title
            title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
            res_data['title'] = title_node.get_text()

            # <div class="lemma-summary"> holds the summary paragraph
            summary_node = soup.find('div', class_="lemma-summary")
            res_data['summary'] = summary_node.get_text()

            return res_data

        def parse(self, page_url, html_cont):
            if page_url is None or html_cont is None:
                return

            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
            new_urls = self._get_new_urls(page_url, soup)
            new_data = self._get_new_data(page_url, soup)  # old error: this line also started with new_urls =
            return new_urls, new_data

Downloader (html_downloader.py):

    import urllib2

    class HtmlDownloader(object):

        def download(self, url):
            if url is None:
                return None

            response = urllib2.urlopen(url)

            if response.getcode() != 200:
                return None

            return response.read()
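As noted above, html_parser.py is missing its _get_new_urls helper and the imports that helper would need. Judging from the imooc template and the /item/ root URL, it presumably collects the other Baike entry links on each page, roughly like this sketch (an assumption, not the original code); the method belongs inside the same HtmlParser class:

```python
import re
import urlparse   # Python 2, matching the urllib2 downloader; urllib.parse in Python 3

# inside class HtmlParser(object):
def _get_new_urls(self, page_url, soup):
    new_urls = set()
    # Baidu Baike entry links look like /item/<name> (older pages used /view/<id>.htm)
    links = soup.find_all('a', href=re.compile(r"/item/"))
    for link in links:
        new_full_url = urlparse.urljoin(page_url, link['href'])  # make relative hrefs absolute
        new_urls.add(new_full_url)
    return new_urls
```

With that in place, running the main file starts the crawl from the OI entry and writes the collected titles and summaries into output.html.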