Companion code for the textbook《数据采集与预处理》(Data Collection and Preprocessing) by 林子雨 (from the textbook's official website)
Chapter 3  Web Data Collection
web_demo.html
<html>
<head><title>搜索指数</title></head>
<body>
<table>
<tr><td>排名</td><td>关键词</td><td>搜索指数</td></tr>
<tr><td>1</td><td>大数据</td><td>187767</td></tr>
<tr><td>2</td><td>云计算</td><td>178856</td></tr>
<tr><td>3</td><td>物联网</td><td>122376</td></tr>
</table>
</body>
</html>
>>> import urllib.request
>>> response=urllib.request.urlopen("http://www.baidu.com")
>>> html=response.read()
>>> print(html)
>>> import urllib.parse
>>> import urllib.request
>>> # 1. Specify the URL
>>> url = 'https://fanyi.baidu.com/sug'
>>> # 2. Before sending the POST request, prepare the parameters it will carry
>>> # 2.1 Put the POST parameters into a dictionary
>>> data = {'kw':'苹果',}
>>> # 2.2 Encode the parameters with urlencode from the parse module (the return value is a string)
>>> data = urllib.parse.urlencode(data)
>>> # Convert the encoded result from step 2.2 to bytes
>>> data = data.encode()
>>> # 3. Send the POST request: the data argument of urlopen carries the processed POST parameters
>>> response = urllib.request.urlopen(url=url,data=data)
>>> data = response.read()
>>> print(data)
b'{"errno":0,"data":[{"k":"\\u82f9\\u679c","v":"\\u540d. apple"},{"k":"\\u82f9\\u679c\\u56ed","v":"apple grove"},{"k":"\\u82f9\\u679c\\u5934","v":"apple head"},{"k":"\\u82f9\\u679c\\u5e72","v":"[\\u533b]dried apple"},{"k":"\\u82f9\\u679c\\u6728","v":"applewood"}]}'
> pip install urllib3
>>> import urllib3
>>> # A PoolManager instance is needed to issue requests; it handles connection pooling and thread safety automatically, so no manual management is required
>>> http = urllib3.PoolManager()
>>> response = http.request('GET','http://www.baidu.com')
>>> print(response.status)
>>> print(response.data)
>>> import urllib3
>>> http = urllib3.PoolManager()
>>> response = http.request('POST', 'https://fanyi.baidu.com/sug', fields={'kw':'苹果',})
>>> print(response.data)
> pip install requests
>>> import requests
>>> response = requests.get('http://www.baidu.com') # Send a request to the target page
>>> print('状态码:',response.status_code) # Print the status code
>>> print('url:',response.url) # Print the request URL
>>> print('header:',response.headers) # Print the response headers
>>> print('cookie:',response.cookies) # Print the cookies
>>> print('text:',response.text) # Print the page source as text
>>> print('content:',response.content) # Print the page source as a byte stream
>>> # Import the module
>>> import requests
>>> # Form parameters
>>> data = {'kw':'苹果',}
>>> # Send a request to the target page
>>> response = requests.post('https://fanyi.baidu.com/sug',data=data)
>>> # Print the page source as a byte stream
>>> print(response.content)
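Because the response body is JSON, requests can also parse it directly. A small additional sketch (not in the textbook listing), continuing from the response object above:
>>> result = response.json()  # parse the JSON body into a Python dictionary
>>> for entry in result['data']:
    print(entry['k'], entry['v'])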
>>> import requests
>>> base_url = 'http://httpbin.org'
>>> param_data = {'user':'xmu','password':'123456'}
>>> response = requests.get(base_url+'/get',params=param_data)
>>> print(response.url)
http://httpbin.org/get?user=xmu&password=123456
>>> print(response.status_code)
200
>>> import requests
>>> url='http://httpbin.org'
>>> # Create the request headers
>>> headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
>>> response = requests.get(url,headers=headers)
>>> print(response.content)
time_out.py
# time_out.py
import requests
from requests.exceptions import ReadTimeout,ConnectTimeout
try:
    response = requests.get("http://www.baidu.com", timeout=0.5)
    print(response.status_code)
except (ReadTimeout, ConnectTimeout):
    print('Timeout')
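A possible variation (not from the textbook): requests.exceptions.Timeout is the common parent class of ReadTimeout and ConnectTimeout, so a single except clause covers both cases:
# timeout_sketch.py -- hypothetical file name, same idea as time_out.py
import requests
from requests.exceptions import Timeout
try:
    response = requests.get("http://www.baidu.com", timeout=0.5)
    print(response.status_code)
except Timeout:
    print('Timeout')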
> pip install lxml
>>> html_doc = """
<html><head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a href="http://example.com/hadoop" class="software" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="software" id="link2">Spark</a> and
<a href="http://example.com/flink" class="software" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
"""
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(html_doc,"lxml")
>>> content = soup.prettify()
>>> print(content)
<html>
 <head>
  <title>
   BigData Software
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    BigData Software
   </b>
  </p>
  <p class="bigdata">
   There are three famous bigdata softwares; and their names are
   <a class="software" href="http://example.com/hadoop" id="link1">
    Hadoop
   </a>
   ,
   <a class="software" href="http://example.com/spark" id="link2">
    Spark
   </a>
   and
   <a class="software" href="http://example.com/flink" id="link3">
    Flink
   </a>
   ;
   and they are widely used in real applications.
  </p>
  <p class="bigdata">
   ...
  </p>
 </body>
</html>
>>> soup = BeautifulSoup(html_doc,"html.parser")
>>> print(soup.a)
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>
>>> print(soup.title)
<title>BigData Software</title>
>>> print(soup.name)
[document]
>>> print(soup.p.attrs)
{'class': ['title']}
>>> print(soup.p['class'])
['title']
>>> print(soup.p.get('class'))
['title']
>>> print(soup.p.string)
BigData Software
>>> print(type(soup.p.string))
<class 'bs4.element.NavigableString'>
>>> print(type(soup.name))
<class 'str'>
>>> print(soup.name)
[document]
>>> print(soup.attrs)
{}
bs4_example.py
# bs4_example.py
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,"lxml")
print(soup.a)
print(soup.a.string)
print(type(soup.a.string))
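For reference, the first <a> tag in this document contains only the HTML comment <!-- Elsie -->, so soup.a.string comes back as a Comment object rather than an ordinary NavigableString; the output should look roughly like:
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
 Elsie 
<class 'bs4.element.Comment'>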
>>> html_doc = """
<html><head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a href="http://example.com/hadoop" class="software" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="software" id="link2">Spark</a> and
<a href="http://example.com/flink" class="software" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
"""
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(html_doc,"lxml")
>>> print(soup.body.contents)
[<p class="title"><b>BigData Software</b></p>, '\n', <p class="bigdata">There are three famous bigdata softwares; and their names are
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>,
<a class="software" href="http://example.com/spark" id="link2">Spark</a> and
<a class="software" href="http://example.com/flink" id="link3">Flink</a>;
and they are widely used in real applications.</p>, '\n', <p class="bigdata">...</p>, '\n']
>>> print(soup.body.contents[0])
<p class="title"><b>BigData Software</b></p>
>>> for child in soup.body.children:
    print(child)
>>> for child in soup.descendants:
    print(child)
>>> print(soup.title)
<title>BigData Software</title>
>>> print(soup.title.string)
BigData Software
>>> print(soup.head)
<head><title>BigData Software</title></head>
>>> print(soup.head.string)
BigData Software
>>> print(soup.body)
<body><p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>,
<a class="software" href="http://example.com/spark" id="link2">Spark</a> and
<a class="software" href="http://example.com/flink" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
</body>
>>> print(soup.body.string)
None
>>> print(soup.strings)
<generator object Tag._all_strings at 0x0000000002C4D190>
>>> for string in soup.strings:
    print(repr(string))
>>> for string in soup.stripped_strings:
    print(string)
>>> p = soup.p
>>> print(p.parent.name)
body
>>> content = soup.head.title.string
>>> print(content)
BigData Software
>>> print(content.parent.name)
title
>>> content = soup.head.title.string
>>> print(content)
BigData Software
>>> for parent in content.parents:
    print(parent.name)
>>> print(soup.p.next_sibling)
>>> print(soup.p.previous_sibling)
None  # there is no previous sibling, so None is returned
>>> print(soup.p.next_sibling.next_sibling)
>>> for next in soup.a.next_siblings:
    print(repr(next))
>>> print(soup.head.next_element)
<title>BigData Software</title>
>>> for element in soup.a.next_elements:
    print(repr(element))
>>> print(soup.find_all('a'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> import re
>>> for tag in soup.find_all(re.compile("^b")):
    print(tag)
>>> print(soup.find_all(["a","b"]))
>>> print(soup.find_all(id=True))
>>> def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
>>> print(soup.find_all(has_class_but_no_id))
>>> import re
>>> print(soup.find_all(id='link2'))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.find_all(href=re.compile("spark")))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> soup.find_all(href=re.compile("hadoop"), id='link1')
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.find_all(class_="software"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> import re
>>> print(soup.a)
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>
>>> print(soup.find_all(text="Hadoop"))
['Hadoop']
>>> print(soup.find_all(text=["Hadoop", "Spark", "Flink"]))
['Hadoop', 'Spark', 'Flink']
>>> print(soup.find_all(text="bigdata"))
[]
>>> print(soup.find_all(text="BigData Software"))
['BigData Software', 'BigData Software']
>>> print(soup.find_all(text=re.compile("bigdata")))
['There are three famous bigdata softwares; and their names are\n']
>>> print(soup.find_all("a"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.find_all("a",limit=2))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.body.find_all("a",recursive=False))
[]
>>> print(soup.select('title'))
[<title>BigData Software</title>]
>>> print(soup.select('a'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('b'))
[<b>BigData Software</b>]
>>> print(soup.select('.software'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('#link1'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select('p #link1'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select("head > title"))
[<title>BigData Software</title>]
>>> print(soup.select("p > a:nth-of-type(1)"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select("p > a:nth-of-type(2)"))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.select("p > a:nth-of-type(3)"))
[<a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('a[class="software"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('a[href="http://example.com/hadoop"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select('p a[href="http://example.com/hadoop"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(type(soup.select('title')))
<class 'bs4.element.ResultSet'>
>>> print(soup.select('title')[0].get_text())
BigData Software
>>> for title in soup.select('title'):
    print(title.get_text())
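A related shortcut not shown in the listing above: select_one() returns just the first element matching a CSS selector (or None), so no indexing into the result list is needed:
>>> print(soup.select_one('#link1'))
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>
>>> print(soup.select_one('title').get_text())
BigData Software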
# baidu_hot.py
import requests
from bs4 import BeautifulSoup
# Request the web page
def request_page(url, headers):
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    return response.text

# Parse the web page
def parse_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    all_topics = soup.find_all('tr')[1:]
    for each_topic in all_topics:
        topic_times = each_topic.find('td', class_='last')    # search index
        topic_rank = each_topic.find('td', class_='first')    # rank
        topic_name = each_topic.find('td', class_='keyword')  # title
        if topic_rank is not None and topic_name is not None and topic_times is not None:
            topic_rank = each_topic.find('td', class_='first').get_text().replace(' ', '').replace('\n', '')
            topic_name = each_topic.find('td', class_='keyword').get_text().replace(' ', '').replace('\n', '')
            topic_times = each_topic.find('td', class_='last').get_text().replace(' ', '').replace('\n', '')
            tplt = "排名:{0:^4}\t标题:{1:{3}^15}\t热度:{2:^8}"
            print(tplt.format(topic_rank, topic_name, topic_times, chr(12288)))

if __name__ == '__main__':
    url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = request_page(url, headers)
    parse_page(html)
The parse_poem.py code printed in the textbook no longer runs; please use the latest code below.
# parse_poem.py
# -*- coding: utf-8 -*-
# @ModuleName: parse_poem
# @Function:
# @Author: dblab
# @Time: 2022/10/25 14:36
import requests
from bs4 import BeautifulSoup
import time
# Function 1: request the web page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    html = response.content.decode('utf-8')
    return html
# Function 2: parse the web page
def page_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup('title')
    # Sentence information: sentence + source + link
    info = soup.select('body > div.main3 > div.left > div.sons > div.cont')
    # Sentence links
    sentence = soup.select('div.left > div.sons > div.cont > a:nth-of-type(1)')
    sentence_list = []
    href_list = []
    for i in range(len(info)):
        curInfo = info[i]
        poemInfo = ''
        poemInfo = poemInfo.join(curInfo.get_text().split('\n'))
        sentence_list.append(poemInfo)
        href = sentence[i].get('href')
        href_list.append("https://so.gushiwen.org" + href)
    # TODO: the number of sentence and poet elements may not match
    # sentence = soup.select('div.left > div.sons > div.cont > a:nth-of-type(1)')
    # poet = soup.select('div.left > div.sons > div.cont > a:nth-of-type(2)')
    # for i in range(len(sentence)):
    #     temp = sentence[i].get_text() + "---" + poet[i].get_text()
    #     sentence_list.append(temp)
    #     href = sentence[i].get('href')
    #     href_list.append("https://so.gushiwen.org" + href)
    return [href_list, sentence_list]
def save_txt(info_list):
    import json
    with open(r'sentence.txt', 'a', encoding='utf-8') as txt_file:
        for element in info_list[1]:
            txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
# Sub-page handler: enter and request the sub-pages
def sub_page_request(info_list):
    subpage_urls = info_list[0]
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)
    return sub_html
# Sub-page handler: parse the sub-pages and extract the poem text
def sub_page_parse(sub_html):
    poem_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, 'lxml')
        poem = soup.select('div.left > div.sons > div.cont > div.contson')
        if len(poem) == 0:
            continue
        poem = poem[0].get_text()
        poem_list.append(poem.strip())
    return poem_list
# Sub-page handler: save the poems to a txt file
def sub_page_save(poem_list):
    import json
    with open(r'poems.txt', 'a', encoding='utf-8') as txt_file:
        for element in poem_list:
            txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
if __name__ == '__main__':
    print("**************开始爬取古诗文网站********************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    poemCount = 0
    for i in range(1, 5):
        # TODO: the link used in the textbook is outdated
        # url = 'https://so.gushiwen.org/mingju/default.aspx?p=%d&c=&t=' % (i)
        url = 'https://so.gushiwen.cn/mingjus/default.aspx?page=%d' % i
        print(url)
        # time.sleep(1)
        html = page_request(url, ua)
        info_list = page_parse(html)
        save_txt(info_list)
        # Start processing the sub-pages
        print("开始解析第%d" % i + "页")
        # Parse the sub-pages of the famous sentences
        sub_html = sub_page_request(info_list)
        poem_list = sub_page_parse(sub_html)
        sub_page_save(poem_list)
        poemCount += len(info_list[0])
    print("****************爬取完成***********************")
    print("共爬取%d" % poemCount + "个古诗词名句")
    print("共爬取%d" % poemCount + "个古诗词")
web_demo.html
<html>
<head><title>搜索指数</title></head>
<body>
<table>
<tr><td>排名</td><td>关键词</td><td>搜索指数</td></tr>
<tr><td>1</td><td>大数据</td><td>187767</td></tr>
<tr><td>2</td><td>云计算</td><td>178856</td></tr>
<tr><td>3</td><td>物联网</td><td>122376</td></tr>
</table>
</body>
</html>
mysql > CREATE DATABASE webdb;
mysql > USE webdb;
mysql> create table search_index(
-> id int,
-> keyword char(20),
-> number int);
# html_to_mysql.py
import requests
from bs4 import BeautifulSoup
# Read the local HTML file
def get_html():
    path = 'C:/web_demo.html'
    htmlfile = open(path, 'r', encoding='utf-8')
    html = htmlfile.read()
    return html
# Parse the HTML file
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    all_tr = soup.find_all('tr')[1:]
    all_tr_list = []
    info_list = []
    for i in range(len(all_tr)):
        all_tr_list.append(all_tr[i])
    for element in all_tr_list:
        all_td = element.find_all('td')
        all_td_list = []
        for j in range(len(all_td)):
            all_td_list.append(all_td[j].string)
        # print(all_td_list)
        info_list.append(all_td_list)
    return info_list
# Save the data to the database
def save_mysql(info_list):
    import pymysql.cursors
    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',      # database user name
        passwd='123456',  # password
        db='webdb',
        charset='utf8'
    )
    # Get a cursor
    cursor = connect.cursor()
    # Insert the data
    for item in info_list:
        id = int(item[0])
        keyword = item[1]
        number = int(item[2])
        sql = "INSERT INTO search_index(id,keyword,number) VALUES (%s, %s, %s)"
        data = (id, keyword, number)
        cursor.execute(sql, data)
        connect.commit()
        print('成功插入数据')
    # Close the database connection
    connect.close()
if __name__ == '__main__':
    html = get_html()
    info_list = parse_html(html)
    save_mysql(info_list)
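To confirm that the rows actually reached MySQL, a small check script can be run afterwards. The sketch below is not part of the textbook code (the file name verify_webdb.py is made up) and assumes the same connection settings used in save_mysql() above:
# verify_webdb.py (hypothetical helper, not part of the textbook code)
import pymysql

connect = pymysql.connect(host='localhost', port=3306, user='root',
                          passwd='123456', db='webdb', charset='utf8')
cursor = connect.cursor()
cursor.execute('SELECT id, keyword, number FROM search_index')
for row in cursor.fetchall():
    print(row)
connect.close()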
<html>
<head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata software;and their names are
<a href="http://example.com/hadoop" class="hadoop" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="spark" id="link2">Spark</a>and
<a href="http://example.com/flink" class="flink" id="link3"><!--Flink--></a>;
and they are widely used in real application.</p>
<p class="bigdata">...</p>
</html>
> pip install lxml
>>> html_text = """
<html>
<body>
<head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata software;and their names are
<a href="http://example.com/hadoop" class="bigdata Hadoop" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="bigdata Spark" id="link2">Spark</a>and
<a href="http://example.com/flink" class="bigdata Flink" id="link3"><!--Flink--></a>;
and they are widely used in real application.</p>
<p class="bigdata">others</p>
<p>……</p>
</body>
</html>
"""
>>> from lxml import etree
>>> html = etree.HTML(html_text)
>>> html_data = html.xpath('body')
>>> print(html_data)
[<Element body at 0x1608dda2d80>]
>>> for element in html_data:
    print(etree.tostring(element))
>>> html_data = html.xpath('/html/body/p/a')
>>> for element in html_data:
    print(etree.tostring(element))
>>> html_data = html.xpath('//a')
>>> for element in html_data:
    print(etree.tostring(element))
>>> html_data = html.xpath('//p/a[@class="bigdata Spark"]')
>>> for element in html_data:
    print(etree.tostring(element))
>>> html_data = html.xpath('//body/p[@class="bigdata"]')
>>> for element in html_data:
    print(etree.tostring(element))
>>> html = etree.HTML(html_text)
>>> html_data = html.xpath('//a[contains(@class, "bigdata")]/text()')
>>> print(html_data)
['Hadoop', 'Spark']
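Only 'Hadoop' and 'Spark' appear in the result, because the text of the third <a> element is an HTML comment (<!--Flink-->) and text() does not match comment nodes. As an extra sketch (not in the textbook listing), attribute values can be extracted in the same way with @:
>>> html_data = html.xpath('//p/a/@href')
>>> print(html_data)
['http://example.com/hadoop', 'http://example.com/spark', 'http://example.com/flink']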
items.py
import scrapy
class PoemscrapyItem(scrapy.Item):
    # Famous sentence
    sentence = scrapy.Field()
    # Source of the sentence
    source = scrapy.Field()
    # Link to the full text
    url = scrapy.Field()
    # Detailed information about the sentence
    content = scrapy.Field()
poemSpider.py
import scrapy
from scrapy import Request
from ..items import PoemscrapyItem
class PoemspiderSpider(scrapy.Spider):
    name = 'poemSpider'  # used to distinguish different spiders
    allowed_domains = ['gushiwen.cn']  # domains the spider is allowed to visit
    start_urls = ['http://so.gushiwen.cn/mingjus/']  # start URL to crawl

    def parse(self, response):
        # First get the div of each famous sentence
        for box in response.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]/div'):
            # Get the link of the sentence
            url = 'https://so.gushiwen.cn' + box.xpath('.//@href').get()
            # Get the text of the sentence
            sentence = box.xpath('.//a[1]/text()').get()
            # Get the source of the sentence
            source = box.xpath('.//a[2]/text()').get()
            # Instantiate the item container
            item = PoemscrapyItem()
            # Pack the collected information into the item
            item['url'] = url
            item['sentence'] = sentence
            item['source'] = source
            # Process the sub-page
            yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_detail)
        # Turn to the next page
        next = response.xpath('//a[@class="amore"]/@href').get()
        if next is not None:
            next_url = 'https://so.gushiwen.cn' + next
            # Process the next page
            yield Request(next_url)

    def parse_detail(self, response):
        # Get the detailed information of the sentence
        item = response.meta['item']
        content_list = response.xpath('//div[@class="contson"]//text()').getall()
        content = "".join(content_list).strip().replace('\n', '').replace('\u3000', '')
        item['content'] = content
        yield item
pipelines.py
import json
class PoemscrapyPipeline:
    def __init__(self):
        # Open the output file
        self.file = open('data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Read the data in the item
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # Write it to the file
        self.file.write(line)
        return item
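One detail worth noting (not in the textbook code): this pipeline never closes data.txt. A common pattern is to add a close_spider() method to the class, as the MySQL version of pipelines.py further below does:
    def close_spider(self, spider):
        # Close the output file when the spider finishes
        self.file.close()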
settings.py
BOT_NAME = 'poemScrapy'
SPIDER_MODULES = ['poemScrapy.spiders']
NEWSPIDER_MODULE = 'poemScrapy.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4421.5 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Set the logging level
LOG_LEVEL = 'WARNING'
ITEM_PIPELINES = {
'poemScrapy.pipelines.PoemscrapyPipeline': 1,
}
from scrapy import cmdline
cmdline.execute("scrapy crawl poemSpider".split())
CREATE DATABASE poem;
DROP TABLE IF EXISTS `beautifulsentence`;
CREATE TABLE `beautifulsentence` (
`source` varchar(255) NOT NULL,
`sentence` varchar(255) NOT NULL,
`content` text NOT NULL,
`url` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
pipelines.py
from itemadapter import ItemAdapter
import json
import pymysql
class PoemscrapyPipeline:
    def __init__(self):
        # Connect to the MySQL database
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',  # change this to your own database password
            db='poem',
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Write the item into the database
        self.cursor.execute(
            'INSERT INTO beautifulsentence(source,sentence,content,url) VALUES (%s, %s, %s, %s)',
            (item['source'], item['sentence'], item['content'], item['url']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.connect.close()
USE poem;
SELECT * FROM beautifulsentence;