Crawling News from the Homepage of the Xiamen University Computer Science Department Website


Reference textbook: Data Collection and Preprocessing, by Lin Ziyu (textbook official website)
Task: crawl the news items from the homepage of the Xiamen University Computer Science Department website

Code:

# -*- coding: utf-8 -*-
# @ModuleName: crawlNews
# @Function: 
# @Author: dblab
# @Time: 2022/10/29 15:32

import requests
from bs4 import BeautifulSoup
import time

# Function 1: request a web page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    html = response.content.decode('utf-8')
    return html

# Function 2: parse the homepage
def page_parse(html):
    soup = BeautifulSoup(html, 'lxml')     
    info = soup.select('#block-system-main > div > div > div > div.center-wrapper > div.panel-panel.panel-col > div > div.panel-pane.pane-views.pane-xinwendongt.center-first.blankLinks > div > div.pane-content.content > div > div > div > ul > li > span.views-field.views-field-title > span > a')    
    #print(info)
    title_list = []
    href_list = []
    for i in range(len(info)):
        curInfo = info[i]        
        title = curInfo.get_text()
        title_list.append(title)
        href = curInfo.get('href')
        href_list.append("http://cs.xmu.edu.cn/"+href)

    #print(title_list)
    #print(href_list)
    return [href_list, title_list]

# Function 3: save the news titles to title.txt
def save_txt(info_list):
    import json
    with open(r'title.txt', 'w', encoding='utf-8') as txt_file:
        for element in info_list[1]:
            #txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
            txt_file.write(element + '\n\n')

# Sub-page handler: request each news sub-page
def sub_page_request(info_list):
    subpage_urls = info_list[0]
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)
    return sub_html

# Sub-page handler: parse each sub-page and extract the news content
def sub_page_parse(sub_html):
    news_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, 'lxml')
        title = soup.select('#content-region-inner > h1')
        news = soup.select('#node-154 > div > div.content > div')  # note: this id matches only a single news node, so most sub-pages are skipped
        if len(news) == 0:
            continue        
        news_content = news[0].get_text()
        #print(title[0].get_text().split('\n')[0])
        #print(news[0].get_text())
        news_list.append(title[0].get_text()+news_content)
    #print(news_list[0])
    return news_list

# Sub-page handler: save the news items to news.txt
def sub_page_save(news_list):
    import json
    with open(r'news.txt', 'w', encoding='utf-8') as txt_file:
        for element in news_list:
            #txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
            txt_file.write(element+ '\n\n')

if __name__ == '__main__':
    print("**************开始爬取厦门大学计算机系网站********************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    url = 'https://cs.xmu.edu.cn/'
    print(url)

    html = page_request(url, ua)
    info_list = page_parse(html)
    save_txt(info_list)
    sub_html = sub_page_request(info_list)
    news_list = sub_page_parse(sub_html)
    sub_page_save(news_list)

    print("****************爬取完成***********************")

The original code above can be improved further, for example by building absolute links with urljoin and by replacing the node-specific CSS selectors with class-based ones. The version revised by teacher Zheng Haishan is as follows:

# -*- coding: utf-8 -*-
# @ModuleName: crawlNews
# @Function:
# @Author: dblab
# @Time: 2022/10/29 15:32

# import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

START_URL = "https://cs.xmu.edu.cn/"

# Function 1: request a web page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    # decode with the detected encoding so the page text is handled correctly
    response.encoding = response.apparent_encoding
    return response.text

# Function 2: parse the homepage
def page_parse(html):
    soup = BeautifulSoup(html, "lxml")
    info = soup.select(".view-xinwendongt a")

    title_list = []
    href_list = []
    for curInfo in info:
        title = curInfo.get_text()
        title_list.append(title)
        href = curInfo.get("href")
        href_list.append(urljoin(START_URL, href))

    return (href_list, title_list)

# Save the news titles to title.txt
def save_title_to_txt(href_title_list):
    with open(r"title.txt", "wt", encoding="utf-8") as txt_file:
        for element in href_title_list[1]:
            txt_file.write(element + "\n\n")

# Sub-page handler: request each news sub-page
def sub_page_request(href_title_list, ua):
    subpage_urls = href_title_list[0]

    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)

    return sub_html

# Sub-page handler: parse each sub-page and extract the news content
def sub_page_parse(sub_html):
    news_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, "lxml")
        title = soup.select("#content-region-inner > h1")
        news = soup.select(".v_news_content")
        if not news or not title:  # skip pages that do not match the expected structure
            continue

        news_content = news[0].get_text()
        news_list.append(title[0].get_text() + news_content)

    return news_list

# Sub-page handler: save the news items to news.txt
def sub_page_save(news_list):
    with open(r"news.txt", "wt", encoding="utf-8") as txt_file:
        for element in news_list:
            txt_file.write(element + "\n\n")

if __name__ == "__main__":
    print(f"{'开始爬取厦门大学计算机系网站':*^{60}}")

    ua = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36"
    }
    url = START_URL
    print(url)

    html = page_request(url, ua)
    href_title_list = page_parse(html)
    save_title_to_txt(href_title_list)

    sub_html = sub_page_request(href_title_list, ua)
    news_list = sub_page_parse(sub_html)
    sub_page_save(news_list)

    print(f"{'爬取完成':*^70}")