# --- Procedural version (面向过程的方式) ---
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape joke posts from qiushibaike.com (procedural version).

Fetches one listing page, extracts each post's id and text content with a
regex, and writes every post to its own ``.txt`` file under ``./qiubai/``.

Ported from Python 2 (``urllib2`` / ``print`` statement) to Python 3.
"""
import os
import re
import sys
import urllib.error
import urllib.request

if __name__ == '__main__':
    # 1. Fetch the listing page's HTML source.
    url = 'http://www.qiushibaike.com/textnew/'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        req = urllib.request.Request(url=url, headers=headers)
        res = urllib.request.urlopen(req)
        # Python 3 str is already unicode; the old decode-then-encode to the
        # filesystem encoding (which shadowed the builtin `type`) is unneeded.
        html = res.read().decode('UTF-8')
    except urllib.error.HTTPError as e:  # more specific; must precede URLError
        print(e)
        sys.exit()
    except urllib.error.URLError as e:
        print(e)
        sys.exit()
    # 2. Extract (post id, raw post content) pairs from the page source.
    regex_content = re.compile(
        '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
        re.S)
    items = re.findall(regex_content, html)
    for item in items:
        file_name = item[0].strip('\'')
        # removeprefix/removesuffix drop the literal <span> wrapper; the old
        # lstrip('<span>')/rstrip('</span>') treated the argument as a
        # character set and could eat leading/trailing content characters.
        content = (item[1].strip()
                   .removeprefix('<span>')
                   .removesuffix('</span>')
                   .replace('\n', '')
                   .replace('<br/>', '\n'))
        # 3. Save each post to its own file named after the post id.
        path = 'qiubai'
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = os.path.join(path, file_name + '.txt')
        with open(file_path, 'w') as fp:  # `with` closes fp; no explicit close
            fp.write(content)
# --- Object-oriented version (面向对象的方式) ---
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape joke posts from qiushibaike.com (object-oriented version).

Ported from Python 2 (``urllib2`` / ``print`` statement) to Python 3.
"""
import os
import re
import sys
import urllib.error
import urllib.request


class Spider:
    """Crawler that fetches listing pages and saves each post to a file."""

    def __init__(self):
        # %s in the URL template is replaced with the page number in get_page().
        self.url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

    def get_page(self, page_index):
        """Return the decoded HTML of listing page *page_index*; exit on error."""
        headers = {'User-Agent': self.user_agent}
        try:
            req = urllib.request.Request(url=self.url % str(page_index),
                                         headers=headers)
            res = urllib.request.urlopen(req)
            # Python 3 str is already unicode; no re-encode to the filesystem
            # encoding (the old code shadowed the builtin `type` for that).
            return res.read().decode('UTF-8')
        except urllib.error.HTTPError as e:  # more specific; must precede URLError
            print(e)
            sys.exit()
        except urllib.error.URLError as e:
            print(e)
            sys.exit()

    def analysis(self, html):
        """Return a list of (post id, raw content) tuples found in *html*."""
        regex_content = re.compile(
            '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
            re.S)
        return re.findall(regex_content, html)

    def save(self, items, path):
        """Write each (id, content) pair in *items* to <path>/<id>.txt."""
        if not os.path.exists(path):
            os.makedirs(path)
        for item in items:
            file_name = item[0].strip('\'')
            # removeprefix/removesuffix drop the literal <span> wrapper; the
            # old lstrip('<span>')/rstrip('</span>') treated the argument as
            # a character set and could eat real content characters.
            content = (item[1].strip()
                       .removeprefix('<span>')
                       .removesuffix('</span>')
                       .replace('\n', '')
                       .replace('<br/>', '\n'))
            file_path = os.path.join(path, file_name + '.txt')
            with open(file_path, 'w') as fp:  # `with` closes fp automatically
                fp.write(content)

    def run(self):
        """Crawl listing pages 1-2 and save their posts under ./qiubai/."""
        print('开始抓取内容...')
        for i in range(1, 3):
            html = self.get_page(i)
            items = self.analysis(html)
            self.save(items, 'qiubai')
        print('内容抓取完毕...')


if __name__ == '__main__':
    sp = Spider()
    sp.run()
# ***Scan with WeChat and follow "python测试开发圈" for more testing tutorials!***
# Reposted from: https://www.cnblogs.com/guanfuchang/p/6802191.html