Code for Chapter 3 (Web Data Collection) — companion code for 林子雨编著《数据采集与预处理》


Chapter 3  Web Data Collection

web_demo.html

<html>
<head><title>搜索指数</title></head>
<body>
<table>
<tr><td>排名</td><td>关键词</td><td>搜索指数</td></tr>
<tr><td>1</td><td>大数据</td><td>187767</td></tr>
<tr><td>2</td><td>云计算</td><td>178856</td></tr>
<tr><td>3</td><td>物联网</td><td>122376</td></tr>
</table>
</body>
</html>
>>> import urllib.request
>>> response=urllib.request.urlopen("http://www.baidu.com")
>>> html=response.read()
>>> print(html)
>>> import urllib.parse
>>> import urllib.request
>>> # 1. Specify the URL
>>> url = 'https://fanyi.baidu.com/sug'
>>> # 2. Before issuing the POST request, prepare the parameters it will carry
>>> # 2.1 Put the POST parameters into a dictionary
>>> data = {'kw':'苹果',}
>>> # 2.2 Encode them with urlencode() from the parse module (the return value is a string)
>>> data = urllib.parse.urlencode(data)
>>> # Convert the result of step 2.2 into bytes
>>> data = data.encode()
>>> # 3. Issue the POST request: the data argument of urlopen() carries the processed POST parameters
>>> response = urllib.request.urlopen(url=url,data=data)
>>> data = response.read()
>>> print(data)
b'{"errno":0,"data":[{"k":"\\u82f9\\u679c","v":"\\u540d. apple"},{"k":"\\u82f9\\u679c\\u56ed","v":"apple grove"},{"k":"\\u82f9\\u679c\\u5934","v":"apple head"},{"k":"\\u82f9\\u679c\\u5e72","v":"[\\u533b]dried apple"},{"k":"\\u82f9\\u679c\\u6728","v":"applewood"}]}'
> pip install urllib3
>>> import urllib3
>>> # A PoolManager instance is needed to issue requests; it handles the connection pool and all thread-safety details automatically, with no manual management required
>>> http = urllib3.PoolManager()
>>> response = http.request('GET','http://www.baidu.com')
>>> print(response.status)
>>> print(response.data)
>>> import urllib3
>>> http = urllib3.PoolManager()
>>> response = http.request('POST',
            'https://fanyi.baidu.com/sug',
            fields={'kw':'苹果',})
>>> print(response.data)
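Note that passing fields to request('POST', ...) makes urllib3 encode the body as multipart/form-data. If the server expects an ordinary urlencoded form body instead, a possible sketch (not from the textbook) builds the body by hand:

>>> import urllib3
>>> import urllib.parse
>>> http = urllib3.PoolManager()
>>> body = urllib.parse.urlencode({'kw':'苹果'})   # application/x-www-form-urlencoded body
>>> response = http.request('POST','https://fanyi.baidu.com/sug',
            body=body,
            headers={'Content-Type':'application/x-www-form-urlencoded'})
>>> print(response.data)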
> pip install requests
>>> import requests
>>> response = requests.get('http://www.baidu.com')  # send a request to the target page
>>> print('状态码:',response.status_code)  # print the status code
>>> print('url:',response.url)  # print the request URL
>>> print('header:',response.headers)  # print the response headers
>>> print('cookie:',response.cookies)  # print the cookies
>>> print('text:',response.text)  # print the page source as text
>>> print('content:',response.content)  # print the page source as a byte stream
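If response.text comes back garbled, requests has guessed the wrong character set; a small sketch (the same trick used in baidu_hot.py further down) sets the encoding explicitly before reading the text:

>>> response = requests.get('http://www.baidu.com')
>>> response.encoding = response.apparent_encoding  # use the charset sniffed from the page content
>>> print(response.text)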
>>> # import the module
>>> import requests
>>> # form parameters
>>> data = {'kw':'苹果',}
>>> # send a request to the target page
>>> response = requests.post('https://fanyi.baidu.com/sug',data=data)
>>> # print the page source as a byte stream
>>> print(response.content)
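Because the reply is JSON, requests can also parse it directly; a minimal sketch assuming the same response layout as in the urllib example above:

>>> result = response.json()        # decode the JSON body into a dict
>>> print(result['data'][0]['v'])   # first suggested translation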
>>> import requests
>>> base_url = 'http://httpbin.org'
>>> param_data = {'user':'xmu','password':'123456'}
>>> response = requests.get(base_url+'/get',params=param_data)
>>> print(response.url)
http://httpbin.org/get?user=xmu&password=123456
>>> print(response.status_code)
200
>>> import requests
>>> url='http://httpbin.org'
>>> # Build the request headers
>>> headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
>>> response = requests.get(url,headers=headers)
>>> print(response.content)

time_out.py

# time_out.py
import requests
from requests.exceptions import ReadTimeout,ConnectTimeout
try:
   response = requests.get("http://www.baidu.com", timeout=0.5)
   print(response.status_code)
except (ReadTimeout, ConnectTimeout):
   print('Timeout')
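A timeout is only one of the ways a request can fail. The sketch below (an illustrative variation, not textbook code) catches any requests exception and also treats 4xx/5xx status codes as errors:

# robust_request.py (illustrative variation, not textbook code)
import requests
from requests.exceptions import RequestException

try:
    # (connect timeout, read timeout) in seconds
    response = requests.get("http://www.baidu.com", timeout=(0.5, 1.0))
    response.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
    print(response.status_code)
except RequestException as e:
    print('Request failed:', e)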
> pip install lxml
>>> html_doc = """
<html><head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a href="http://example.com/hadoop" class="software" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="software" id="link2">Spark</a> and
<a href="http://example.com/flink" class="software" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
"""
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(html_doc,"lxml")
>>> content = soup.prettify()
>>> print(content)
<html>
 <head>
  <title>
   BigData Software
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    BigData Software
   </b>
  </p>
  <p class="bigdata">
   There are three famous bigdata softwares; and their names are
   <a class="software" href="http://example.com/hadoop" id="link1">
    Hadoop
   </a>
   ,
   <a class="software" href="http://example.com/spark" id="link2">
    Spark
   </a>
   and
   <a class="software" href="http://example.com/flink" id="link3">
    Flink
   </a>
   ;
and they are widely used in real applications.
  </p>
  <p class="bigdata">
   ...
  </p>
 </body>
</html>
>>> soup = BeautifulSoup(html_doc,"html.parser")
>>> print(soup.a)
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>
>>> print(soup.title)
<title>BigData Software</title>
>>> print(soup.name)
[document]
>>> print(soup.p.attrs)
{'class': ['title']}
>>> print(soup.p['class'])
['title']
>>> print(soup.p.get('class'))
['title']
>>> print(soup.p.string)
BigData Software
>>> print(type(soup.p.string))
<class 'bs4.element.NavigableString'>
>>> print(type(soup.name))
<class 'str'>
>>> print(soup.name)
[document]
>>> print(soup.attrs)
{}

bs4_example.py

# bs4_example.py
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc,"lxml")
print(soup.a)
print(soup.a.string)
print(type(soup.a.string))
>>> html_doc = """
<html><head><title>BigData Software</title></head>
<p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a href="http://example.com/hadoop" class="software" id="link1">Hadoop</a>,
<a href="http://example.com/spark" class="software" id="link2">Spark</a> and
<a href="http://example.com/flink" class="software" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
"""
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(html_doc,"lxml")
>>> print(soup.body.contents)
[<p class="title"><b>BigData Software</b></p>, '\n', <p class="bigdata">There are three famous bigdata softwares; and their names are
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>,
<a class="software" href="http://example.com/spark" id="link2">Spark</a> and
<a class="software" href="http://example.com/flink" id="link3">Flink</a>;
and they are widely used in real applications.</p>, '\n', <p class="bigdata">...</p>, '\n']
>>> print(soup.body.contents[0])
<p class="title"><b>BigData Software</b></p>
>>> for child in soup.body.children:
      print(child)
>>> for child in soup.descendants:
      print(child)
>>> print(soup.title)
<title>BigData Software</title>
>>> print(soup.title.string)
BigData Software
>>> print(soup.head)
<head><title>BigData Software</title></head>
>>> print(soup.head.string)
BigData Software
>>> print(soup.body)
<body><p class="title"><b>BigData Software</b></p>
<p class="bigdata">There are three famous bigdata softwares; and their names are
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>,
<a class="software" href="http://example.com/spark" id="link2">Spark</a> and
<a class="software" href="http://example.com/flink" id="link3">Flink</a>;
and they are widely used in real applications.</p>
<p class="bigdata">...</p>
</body>
>>> print(soup.body.string)
None
>>> print(soup.strings)
<generator object Tag._all_strings at 0x0000000002C4D190>
>>> for string in soup.strings:
      print(repr(string))
>>> for string in soup.stripped_strings:
          print(string)
>>> p = soup.p
>>> print(p.parent.name)
body
>>> content = soup.head.title.string
>>> print(content)
BigData Software
>>> print(content.parent.name)
title
>>> content = soup.head.title.string
>>> print(content)
BigData Software
>>> for parent in content.parents:
      print(parent.name)
>>> print(soup.p.next_sibling)
>>> print(soup.p.previous_sibling)
None  # there is no previous sibling node, so None is returned
>>> print(soup.p.next_sibling.next_sibling)
>>> for next in soup.a.next_siblings:
      print(repr(next))
>>> print(soup.head.next_element)
<title>BigData Software</title>
>>> for element in soup.a.next_elements:
      print(repr(element))
>>> print(soup.find_all('a'))
    [<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> import re
>>> for tag in soup.find_all(re.compile("^b")):
      print(tag)
>>> print(soup.find_all(["a","b"]))
>>> print(soup.find_all(id=True))
>>> def has_class_but_no_id(tag): 
         return tag.has_attr('class') and not tag.has_attr('id')
>>> print(soup.find_all(has_class_but_no_id))
>>> import re
>>> print(soup.find_all(id='link2'))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.find_all(href=re.compile("spark")))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> soup.find_all(href=re.compile("hadoop"), id='link1')
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.find_all(class_="software"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> import re
>>> print(soup.a)
<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>
>>> print(soup.find_all(text="Hadoop"))
['Hadoop']
>>> print(soup.find_all(text=["Hadoop", "Spark", "Flink"]))
['Hadoop', 'Spark', 'Flink']
>>> print(soup.find_all(text="bigdata"))
[]
>>> print(soup.find_all(text="BigData Software"))
['BigData Software', 'BigData Software']
>>> print(soup.find_all(text=re.compile("bigdata")))
['There are three famous bigdata softwares; and their names are\n']
>>> print(soup.find_all("a"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.find_all("a",limit=2))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.body.find_all("a",recursive=False))
[]
>>> print(soup.select('title'))
[<title>BigData Software</title>]
>>> print(soup.select('a'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('b'))
[<b>BigData Software</b>]
>>> print(soup.select('.software'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('#link1'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select('p #link1'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select("head > title"))
[<title>BigData Software</title>]
>>> print(soup.select("p > a:nth-of-type(1)"))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select("p > a:nth-of-type(2)"))
[<a class="software" href="http://example.com/spark" id="link2">Spark</a>]
>>> print(soup.select("p > a:nth-of-type(3)"))
[<a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('a[class="software"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>, <a class="software" href="http://example.com/spark" id="link2">Spark</a>, <a class="software" href="http://example.com/flink" id="link3">Flink</a>]
>>> print(soup.select('a[href="http://example.com/hadoop"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(soup.select('p a[href="http://example.com/hadoop"]'))
[<a class="software" href="http://example.com/hadoop" id="link1">Hadoop</a>]
>>> print(type(soup.select('title')))
<class 'bs4.element.ResultSet'>
>>> print(soup.select('title')[0].get_text())
BigData Software
>>> for title in soup.select('title'):
      print(title.get_text())
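CSS selectors combine naturally with attribute access; a small sketch that pulls the URL of every software link from the same soup object:

>>> for link in soup.select('a.software'):
      print(link.get('href'))
http://example.com/hadoop
http://example.com/spark
http://example.com/flink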

baidu_hot.py

# baidu_hot.py
import requests
from bs4 import BeautifulSoup

# Fetch the web page
def request_page(url,headers):
    response = requests.get(url,headers=headers)
    response.encoding = response.apparent_encoding
    return response.text

# Parse the web page
def parse_page(html):
    soup = BeautifulSoup(html,'html.parser')
    all_topics=soup.find_all('tr')[1:]
    for each_topic in all_topics:
        topic_times = each_topic.find('td',class_='last')    # search index
        topic_rank = each_topic.find('td',class_='first')    # rank
        topic_name = each_topic.find('td',class_='keyword')  # title
        if topic_rank is not None and topic_name is not None and topic_times is not None:
            topic_rank = each_topic.find('td',class_='first').get_text().replace(' ','').replace('\n','')
            topic_name = each_topic.find('td',class_='keyword').get_text().replace(' ','').replace('\n','')
            topic_times = each_topic.find('td',class_='last').get_text().replace(' ','').replace('\n','')
            tplt = "排名:{0:^4}\t标题:{1:{3}^15}\t热度:{2:^8}"
            print(tplt.format(topic_rank,topic_name,topic_times,chr(12288)))

if __name__=='__main__':
    url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    headers = {'User-Agent':'Mozilla/5.0'}
    html = request_page(url,headers)
    parse_page(html)
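If the parsed rows should be kept rather than only printed, one hypothetical variation (not textbook code) has parse_page collect (rank, title, heat) tuples into a list and then writes them out as CSV:

# csv_sketch.py (hypothetical helper; assumes parse_page returns a list of tuples)
import csv

def save_csv(rows, path='baidu_hot.csv'):
    # rows: a list of (rank, title, heat) tuples collected while parsing
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['rank', 'title', 'heat'])  # header row
        writer.writerows(rows)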

The parse_poem.py code printed in the textbook no longer runs; use the updated code below instead.

# parse_poem.py
# -*- coding: utf-8 -*-
# @ModuleName: parse_poem
# @Function: 
# @Author: dblab
# @Time: 2022/10/25 14:36

import requests
from bs4 import BeautifulSoup
import time

# Function 1: fetch the page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    html = response.content.decode('utf-8')
    return html

# Function 2: parse the page
def page_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup('title')
    # Sentence information: sentence text + source + link
    info = soup.select('body > div.main3 > div.left > div.sons > div.cont')
    # Sentence links
    sentence = soup.select('div.left > div.sons > div.cont > a:nth-of-type(1)')
    sentence_list = []
    href_list = []

    for i in range(len(info)):
        curInfo = info[i]
        poemInfo = ''
        poemInfo = poemInfo.join(curInfo.get_text().split('\n'))
        sentence_list.append(poemInfo)

        href = sentence[i].get('href')
        href_list.append("https://so.gushiwen.org" + href)

    # todo: the numbers of sentence and poet entries may not match
    # sentence = soup.select('div.left > div.sons > div.cont > a:nth-of-type(1)')
    # poet = soup.select('div.left > div.sons > div.cont > a:nth-of-type(2)')
    # for i in range(len(sentence)):
    #     temp = sentence[i].get_text() + "---" + poet[i].get_text()
    #     sentence_list.append(temp)
    #     href = sentence[i].get('href')
    #     href_list.append("https://so.gushiwen.org" + href)

    return [href_list, sentence_list]

def save_txt(info_list):
    import json
    with open(r'sentence.txt', 'a', encoding='utf-8') as txt_file:
        for element in info_list[1]:
            txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')

# Subpage helper: request each subpage (visited for detailed parsing)
def sub_page_request(info_list):
    subpage_urls = info_list[0]
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)
    return sub_html

# Subpage helper: parse the subpages and extract the poem text
def sub_page_parse(sub_html):
    poem_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, 'lxml')
        poem = soup.select('div.left > div.sons > div.cont > div.contson')
        if len(poem) == 0:
            continue
        poem = poem[0].get_text()
        poem_list.append(poem.strip())
    return poem_list

# Subpage helper: save the poems to a txt file
def sub_page_save(poem_list):
    import json
    with open(r'poems.txt', 'a', encoding='utf-8') as txt_file:
        for element in poem_list:
            txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')

if __name__ == '__main__':
    print("**************开始爬取古诗文网站********************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    poemCount = 0
    for i in range(1, 5):
        # todo: the original URL below is broken
        # url = 'https://so.gushiwen.org/mingju/default.aspx?p=%d&c=&t=' % (i)

        url = 'https://so.gushiwen.cn/mingjus/default.aspx?page=%d' % i
        print(url)
        # time.sleep(1)
        html = page_request(url, ua)
        info_list = page_parse(html)
        save_txt(info_list)
        # Start processing the subpages
        print("开始解析第%d" % i + "页")
        # Parse the subpage of each famous sentence
        sub_html = sub_page_request(info_list)
        poem_list = sub_page_parse(sub_html)
        sub_page_save(poem_list)

        poemCount += len(info_list[0])

    print("****************爬取完成***********************")
    print("共爬取%d" % poemCount + "个古诗词名句")
    print("共爬取%d" % poemCount + "个古诗词")

web_demo.html

<html>
<head><title>搜索指数</title></head>
<body>
<table>
<tr><td>排名</td><td>关键词</td><td>搜索指数</td></tr>
<tr><td>1</td><td>大数据</td><td>187767</td></tr>
<tr><td>2</td><td>云计算</td><td>178856</td></tr>
<tr><td>3</td><td>物联网</td><td>122376</td></tr>
</table>
</body>
</html>
mysql> CREATE DATABASE webdb;
mysql> USE webdb;
mysql> CREATE TABLE search_index(
    -> id INT,
    -> keyword CHAR(20),
    -> number INT);

html_to_mysql.py

# html_to_mysql.py
import requests
from bs4 import BeautifulSoup

# Read the local HTML file
def get_html():
    path = 'C:/web_demo.html'
    htmlfile = open(path,'r')
    html = htmlfile.read()
    return html

# Parse the HTML file
def parse_html(html):
    soup = BeautifulSoup(html,'html.parser')
    all_tr=soup.find_all('tr')[1:]
    all_tr_list = []
    info_list = []
    for i in range(len(all_tr)):
        all_tr_list.append(all_tr[i])
    for element in all_tr_list:
        all_td=element.find_all('td')
        all_td_list = []
        for j in range(len(all_td)):
            all_td_list.append(all_td[j].string)
        #print(all_td_list)
        info_list.append(all_td_list)
    return info_list

# Save the data into the database
def save_mysql(info_list):
    import pymysql.cursors
    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',  # database user name
        passwd='123456',  # password
        db='webdb',
        charset='utf8'
    )

    # Get a cursor
    cursor = connect.cursor()

    # Insert the data (parameterized query, so values are escaped safely)
    for item in info_list:
        id = int(item[0])
        keyword = item[1]
        number = int(item[2])
        sql = "INSERT INTO search_index(id,keyword,number) VALUES (%s, %s, %s)"
        data = (id,keyword,number)
        cursor.execute(sql, data)
        connect.commit()
    print('成功插入数据')

    # Close the database connection
    connect.close()

if __name__ =='__main__':
    html = get_html()
    info_list = parse_html(html)
    save_mysql(info_list)
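A quick way to confirm that the rows were inserted is a small check script (an illustrative sketch reusing the same connection parameters as save_mysql above):

# verify_sketch.py (illustrative check, not textbook code)
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='123456', db='webdb', charset='utf8')
cursor = connect.cursor()
cursor.execute('SELECT * FROM search_index')
for row in cursor.fetchall():
    print(row)   # each row is an (id, keyword, number) tuple
connect.close()
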
<html>
    <head><title>BigData Software</title></head>
    <p class="title"><b>BigData Software</b></p>
    <p class="bigdata">There are three famous bigdata software;and their names are
        <a href="http://example.com/hadoop" class="hadoop" id="link1">Hadoop</a>,
        <a href="http://example.com/spark" class="spark" id="link2">Spark</a>and
        <a href="http://example.com/flink" class="flink" id="link3"><!--Flink--></a>;
        and they are widely used in real application.</p>
    <p class="bigdata">...</p>
</html>
> pip install lxml
>>> html_text = """
<html>
  <body>
    <head><title>BigData Software</title></head>
    <p class="title"><b>BigData Software</b></p>
    <p class="bigdata">There are three famous bigdata software;and their names are
      <a href="http://example.com/hadoop" class="bigdata Hadoop" id="link1">Hadoop</a>,
      <a href="http://example.com/spark" class="bigdata Spark" id="link2">Spark</a>and
      <a href="http://example.com/flink" class="bigdata Flink" id="link3"><!--Flink--></a>;
        and they are widely used in real application.</p>
    <p class="bigdata">others</p>
    <p>……</p>
  </body>
</html>
"""
>>> from lxml import etree
>>> html = etree.HTML(html_text)
>>> html_data = html.xpath('body')
>>> print(html_data)
[<Element body at 0x1608dda2d80>]
>>> for element in html_data:
    print(etree.tostring(element))
>>> html_data = html.xpath('/html/body/p/a')
>>> for element in html_data:
        print(etree.tostring(element))
>>> html_data = html.xpath('//a')
>>> for element in html_data:
        print(etree.tostring(element))
>>> html_data = html.xpath('//p/a[@class="bigdata Spark"]')
>>> for element in html_data:
        print(etree.tostring(element))
>>> html_data = html.xpath('//body/p[@class="bigdata"]')
>>> for element in html_data:
     print(etree.tostring(element))
>>> html = etree.HTML(html_text)
>>> html_data = html.xpath('//a[contains(@class, "bigdata")]/text()')
>>> print(html_data)
['Hadoop', 'Spark']
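XPath can also extract attribute values directly; a minimal sketch on the same html object:

>>> html_data = html.xpath('//p/a/@href')   # href attribute of every <a> inside a <p>
>>> print(html_data)
['http://example.com/hadoop', 'http://example.com/spark', 'http://example.com/flink']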

items.py

import scrapy

class PoemscrapyItem(scrapy.Item):
    # famous sentence
    sentence = scrapy.Field()
    # source of the sentence
    source = scrapy.Field()
    # link to the full text
    url = scrapy.Field()
    # detailed information about the sentence
    content = scrapy.Field()

poemSpider.py

import scrapy
from scrapy import Request
from ..items import PoemscrapyItem

class PoemspiderSpider(scrapy.Spider):
    name = 'poemSpider'   # used to distinguish this spider from others
    allowed_domains = ['gushiwen.cn']  # domains the spider is allowed to visit
    start_urls = ['http://so.gushiwen.cn/mingjus/']  # starting URL to crawl

    def parse(self, response):
        # First get the div of each famous sentence
        for box in response.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]/div'):
            # Get the link of the sentence
            url = 'https://so.gushiwen.cn' + box.xpath('.//@href').get()
            # Get the text of the sentence
            sentence = box.xpath('.//a[1]/text()').get()
            # Get the source of the sentence
            source = box.xpath('.//a[2]/text()').get()
            # Instantiate the item container
            item = PoemscrapyItem()
            # Pack the collected information into the item
            item['url'] = url
            item['sentence'] = sentence
            item['source'] = source
            # Process the detail subpage
            yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_detail)
        # Turn to the next page
        next = response.xpath('//a[@class="amore"]/@href').get()
        if next is not None:
            next_url = 'https://so.gushiwen.cn' + next
            # Process the next page
            yield Request(next_url)

    def parse_detail(self, response):
        # Get the detailed information of the sentence
        item = response.meta['item']
        content_list = response.xpath('//div[@class="contson"]//text()').getall()
        content = "".join(content_list).strip().replace('\n', '').replace('\u3000', '')
        item['content'] = content
        yield item

pipelines.py

import json

class PoemscrapyPipeline:
    def __init__(self):
        # Open the output file
        self.file = open('data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Read the data from the item
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # Write it to the file
        self.file.write(line)
        return item

settings.py

BOT_NAME = 'poemScrapy'

SPIDER_MODULES = ['poemScrapy.spiders']
NEWSPIDER_MODULE = 'poemScrapy.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4421.5 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Log level for console output
LOG_LEVEL = 'WARNING'

ITEM_PIPELINES = {
    'poemScrapy.pipelines.PoemscrapyPipeline': 1,
}
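The two lines below form a small launcher script (commonly saved as run.py in the project root; the original page does not name the file), which starts the poemSpider crawler without typing the scrapy command by hand.

# launcher script (file name assumed, e.g. run.py)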
from scrapy import cmdline
cmdline.execute("scrapy crawl poemSpider".split())
CREATE DATABASE poem;
DROP TABLE IF EXISTS `beautifulsentence`;
CREATE TABLE `beautifulsentence` (
  `source` varchar(255) NOT NULL,
  `sentence` varchar(255) NOT NULL,
  `content` text NOT NULL,
  `url` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

pipelines.py

from itemadapter import ItemAdapter
import json
import pymysql

class PoemscrapyPipeline:
    def __init__(self):
        # Connect to the MySQL database
        self.connect = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',  # change to your own database password
            db='poem',
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Write the item into the database (parameterized query, so quotes in the text are escaped safely)
        self.cursor.execute(
            'INSERT INTO beautifulsentence(source,sentence,content,url) VALUES (%s,%s,%s,%s)',
            (item['source'], item['sentence'], item['content'], item['url']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.connect.close()
USE poem;
SELECT * FROM beautifulsentence;