Reference textbook: Data Collection and Preprocessing (《数据采集与预处理》), edited by Lin Ziyu (林子雨) (official textbook website)
Task: crawl the news items from the homepage of the Department of Computer Science, Xiamen University
Code:
# -*- coding: utf-8 -*-
# @ModuleName: crawlNews
# @Function:
# @Author: dblab
# @Time: 2022/10/29 15:32
import requests
from bs4 import BeautifulSoup
import time

# Function 1: request a web page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    html = response.content.decode('utf-8')
    return html

# Function 2: parse the homepage and collect news titles and links
def page_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    info = soup.select('#block-system-main > div > div > div > div.center-wrapper > div.panel-panel.panel-col > div > div.panel-pane.pane-views.pane-xinwendongt.center-first.blankLinks > div > div.pane-content.content > div > div > div > ul > li > span.views-field.views-field-title > span > a')
    # print(info)
    title_list = []
    href_list = []
    for i in range(len(info)):
        curInfo = info[i]
        title = curInfo.get_text()
        title_list.append(title)
        href = curInfo.get('href')
        href_list.append("http://cs.xmu.edu.cn/" + href)
    # print(title_list)
    # print(href_list)
    return [href_list, title_list]

# Save the news titles to a txt file
def save_txt(info_list):
    import json
    with open(r'title.txt', 'w', encoding='utf-8') as txt_file:
        for element in info_list[1]:
            # txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
            txt_file.write(element + '\n\n')

# Sub-page handler: request each news sub-page
def sub_page_request(info_list):
    subpage_urls = info_list[0]
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)
    return sub_html

# Sub-page handler: parse each sub-page and extract the news content
def sub_page_parse(sub_html):
    news_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, 'lxml')
        title = soup.select('#content-region-inner > h1')
        news = soup.select('#node-154 > div > div.content > div')
        if len(news) == 0:
            continue
        news_content = news[0].get_text()
        # print(title[0].get_text().split('\n')[0])
        # print(news[0].get_text())
        news_list.append(title[0].get_text() + news_content)
    # print(news_list[0])
    return news_list

# Sub-page handler: save the news content to a txt file
def sub_page_save(news_list):
    import json
    with open(r'news.txt', 'w', encoding='utf-8') as txt_file:
        for element in news_list:
            # txt_file.write(json.dumps(element, ensure_ascii=False) + '\n\n')
            txt_file.write(element + '\n\n')

if __name__ == '__main__':
    print("************** Start crawling the XMU CS department website ********************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}
    url = 'https://cs.xmu.edu.cn/'
    print(url)
    html = page_request(url, ua)
    info_list = page_parse(html)
    save_txt(info_list)
    sub_html = sub_page_request(info_list)
    news_list = sub_page_parse(sub_html)
    sub_page_save(news_list)
    print("**************** Crawling finished ***********************")
The code above can be further improved. The improved version, by Zheng Haishan (郑海山), is shown below. The main changes are: relative links are resolved with urljoin instead of string concatenation, the page encoding is detected via apparent_encoding rather than hard-coded as UTF-8, the long auto-generated CSS selectors are replaced with shorter class-based ones (.view-xinwendongt a and .v_news_content) that are not tied to a specific node id, and the User-Agent header is passed into sub_page_request instead of being redefined there.
# -*- coding: utf-8 -*-
# @ModuleName: crawlNews
# @Function:
# @Author: dblab
# @Time: 2022/10/29 15:32
# import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

START_URL = "https://cs.xmu.edu.cn/"

# Function 1: request a web page
def page_request(url, ua):
    response = requests.get(url, headers=ua)
    response.encoding = response.apparent_encoding  # let requests detect the page encoding
    return response.text

# Function 2: parse the homepage and collect news titles and links
def page_parse(html):
    soup = BeautifulSoup(html, "lxml")
    info = soup.select(".view-xinwendongt a")
    title_list = []
    href_list = []
    for curInfo in info:
        title = curInfo.get_text()
        title_list.append(title)
        href = curInfo.get("href")
        href_list.append(urljoin(START_URL, href))  # resolve relative links against the start URL
    return (href_list, title_list)

# Save the news titles to a txt file
def save_title_to_txt(href_title_list):
    with open(r"title.txt", "wt", encoding="utf-8") as txt_file:
        for element in href_title_list[1]:
            txt_file.write(element + "\n\n")

# Sub-page handler: request each news sub-page
def sub_page_request(href_title_list, ua):
    subpage_urls = href_title_list[0]
    sub_html = []
    for url in subpage_urls:
        html = page_request(url, ua)
        sub_html.append(html)
    return sub_html

# Sub-page handler: parse each sub-page and extract the news content
def sub_page_parse(sub_html):
    news_list = []
    for html in sub_html:
        soup = BeautifulSoup(html, "lxml")
        title = soup.select("#content-region-inner > h1")
        news = soup.select(".v_news_content")
        if not news:
            continue
        news_content = news[0].get_text()
        news_list.append(title[0].get_text() + news_content)
    return news_list

# Sub-page handler: save the news content to a txt file
def sub_page_save(news_list):
    with open(r"news.txt", "wt", encoding="utf-8") as txt_file:
        for element in news_list:
            txt_file.write(element + "\n\n")

if __name__ == "__main__":
    print(f"{'Start crawling the XMU CS department website':*^{60}}")
    ua = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36"
    }
    url = START_URL
    print(url)
    html = page_request(url, ua)
    href_title_list = page_parse(html)
    save_title_to_txt(href_title_list)
    sub_html = sub_page_request(href_title_list, ua)
    news_list = sub_page_parse(sub_html)
    sub_page_save(news_list)
    print(f"{'Crawling finished':*^70}")