编写爬虫爬取图书网站数据

大数据学习路线图

作者:厦门大学计算机系林子雨副教授
说明:本博客是与林子雨编著《数据采集与预处理》教材配套的教学资料。
操作系统:Ubuntu22.04
编程语言:Python3.10

爬取一个数据量相对大一点的网站,网站链接为:https://spa5.scrape.center/
这是一个图书网站,整个网站有数千本图书信息。网站页面数据是JavaScript渲染得到的,但其背后的Ajax接口直接返回JSON数据,因此爬虫可以直接请求接口而无需渲染页面。
下面是爬虫代码:

import json
import pandas as pd
import requests
import urllib3

# Scrape book list + detail data from the scrape.center demo API and
# export the result as a tab-separated file.

# Column accumulators for the final DataFrame.
# (Renamed from `id`/`url` in the original so they no longer shadow builtins.)
book_ids, book_names, book_authors, book_urls, book_themes = [], [], [], [], []

headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10'}
# Suppress the InsecureRequestWarning triggered by verify=False below.
urllib3.disable_warnings()

RECORDS_PER_PAGE = 10
MAX_PAGES = 3  # fetch only the first 3 pages (30 books)
REQUEST_TIMEOUT = 30  # seconds; avoid hanging forever on a dead connection


def _join_clean(items):
    """Join a list of strings into one comma-separated string,
    stripping surrounding whitespace (including newlines) from each element.

    Replaces the original fragile `str(list).replace(...)` chain, which broke
    on author names containing quotes, brackets, or commas.
    """
    return ', '.join(str(item).strip() for item in items)


for page in range(MAX_PAGES):
    offset = page * RECORDS_PER_PAGE
    list_url = f'https://spa5.scrape.center/api/book/?limit={RECORDS_PER_PAGE}&offset={offset}'
    response = requests.get(list_url, headers=headers, verify=False,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing garbage
    data = response.json()

    # Stop early once the API runs out of results.
    if not data['results']:
        break

    for book in data['results']:
        book_id = book['id']
        name = book['name']
        authors = _join_clean(book['authors'] or [])

        # Print progress for each scraped record.
        print(f"ID: {book_id}, Name: {name}, Authors: {authors}")

        # NOTE: ids are collected but (as in the original script) are not
        # written to the output file.
        book_ids.append(book_id)
        book_names.append(name)
        book_authors.append(authors)

        # The list API does not include url/tags; fetch the detail endpoint.
        # An f-string works whether the API returns a str or int id — the
        # original `'...' + book_id + '/'` raised TypeError on a numeric id.
        detail_url = f'https://spa5.scrape.center/api/book/{book_id}/'
        detail_response = requests.get(detail_url, headers=headers, verify=False,
                                       timeout=REQUEST_TIMEOUT)
        detail_response.raise_for_status()
        detail_data = detail_response.json()

        book_url = detail_data['url']
        theme = _join_clean(detail_data['tags'] or [])

        print(f"URL: {book_url}, Theme: {theme}\n")

        book_urls.append(book_url)
        book_themes.append(theme)

# Column labels (Chinese, kept from the original): 链接=URL, 书名=title,
# 作者=authors, 主题=themes.
work = pd.DataFrame({
    "链接": book_urls,
    "书名": book_names,
    "作者": book_authors,
    "主题": book_themes,
})
# Tab-separated output, same as the original (sep='\t', no index column).
work.to_csv('books_list.txt', sep='\t', index=False)