MongoDB Crawler in Practice: Scraping the Hupu Forum
Site URL: https://bbs.hupu.com/bxj
1. Website Analysis
First, locate where each piece of information sits in the page markup: post title, post link, author, author link, creation time, reply count, view count, last reply user, and last reply time. We will then use BeautifulSoup to extract these elements. The mapping is summarized below:
Data | Location
---|---
All data for one post | li
Post title | div class="titlelink box" > a
Post link | div class="titlelink box" > a['href']
Author | div class="author box" > a
Author link | div class="author box" > a['href']
Creation time | div class="author box" > contents[5]
Reply count | span class="ansour box" (before the "/")
View count | span class="ansour box" (after the "/")
Last reply user | div class="endreply box" > span
Last reply time | div class="endreply box" > a
Note also that opening the second page changes the URL to https://bbs.hupu.com/bxj-2, and so on for later pages.
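Before writing the full crawler, it helps to sanity-check both the selector table and the URL pattern interactively. A minimal sketch (assuming the site still serves this 2019-era markup; the loop and variable names here are illustrative, not part of the final script):

```python
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
for page in [1, 2]:
    # Page 1 is /bxj; page n (n >= 2) is /bxj-n
    url = 'https://bbs.hupu.com/bxj' if page == 1 else 'https://bbs.hupu.com/bxj-' + str(page)
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    first_post = soup.find('ul', class_='for-list').find('li')
    # Title of the first post on each page, located via the table above
    print(page, first_post.find('div', class_='titlelink box').a.text.strip())
```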
2. Project Implementation
First, try fetching the data on page 1. The code is as follows:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File : Save05.py
@Author: Xinzhe.Pang
@Date : 2019/7/10 0:14
@Desc :
"""
# Scrape the Hupu forum: https://bbs.hupu.com/bxj
import requests
from bs4 import BeautifulSoup
import datetime


def get_page(link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    r = requests.get(link, headers=headers)
    html = r.content.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup


def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('div', class_='titlelink box').a.text.strip()
        post_link = post.find('div', class_='titlelink box').a['href']
        post_link = "https://bbs.hupu.com" + post_link
        author = post.find('div', class_='author box').a.text.strip()
        author_page = post.find('div', class_='author box').a['href']
        start_date = post.find('div', class_='author box').contents[5].text.strip()
        # Replies and views share one span, formatted as "replies / views"
        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()
        view = reply_view.split('/')[1].strip()
        reply_time = post.find('div', class_='endreply box').a.text.strip()
        last_reply = post.find('div', class_='endreply box').span.text.strip()
        if ':' in reply_time:  # same-day replies show a time such as 11:27
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        else:  # older replies show a date such as 07-08 (no year)
            date_time = datetime.datetime.strptime('2019-' + reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_date,
                          reply, view, last_reply, date_time])
    return data_list


link = "https://bbs.hupu.com/bxj"
soup = get_page(link)
post_all = soup.find('ul', class_="for-list")
post_list = post_all.find_all('li')
data_list = get_data(post_list)
for each in data_list:
    print(each)
```
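One caveat in get_data: the date branch hard-codes the year 2019, so last-reply dates parsed in any other year would be wrong. A minimal, hypothetical parse_reply_time helper that derives the year from today's date instead:

```python
import datetime

def parse_reply_time(reply_time):
    """Parse Hupu's last-reply field: 'HH:MM' for today, 'MM-DD' otherwise."""
    today = datetime.date.today()
    if ':' in reply_time:
        return datetime.datetime.strptime(
            str(today) + ' ' + reply_time, '%Y-%m-%d %H:%M')
    # Assume the date belongs to the current year rather than hard-coding 2019
    return datetime.datetime.strptime(
        str(today.year) + '-' + reply_time, '%Y-%m-%d').date()
```

This still misdates late-December replies when crawling in early January; a fully robust version would compare the parsed month against the current month.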
3. Fetching the First 50 Pages of Data
One issue to watch for: by the time we open page 2, newly bumped posts may already have pushed some of page 1's posts onto page 2. If we stored every record with insert_one, the same post could appear twice in MongoDB, so we use an upsert-style update keyed on the post link instead.
The code is as follows:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@File : MongoAPI.py
@Author: Xinzhe.Pang
@Date : 2019/7/10 22:15
@Desc :
"""
import requests
import datetime
import time
from pymongo import MongoClient
from bs4 import BeautifulSoup


# A helper class for MongoDB: connects to the database, retrieves
# documents, and adds or updates data
class MongoAPI(object):
    def __init__(self, db_ip, db_port, db_name, table_name):
        self.db_ip = db_ip
        self.db_port = db_port
        self.db_name = db_name
        self.table_name = table_name
        self.conn = MongoClient(host=self.db_ip, port=self.db_port)
        self.db = self.conn[self.db_name]
        self.table = self.db[self.table_name]

    def get_one(self, query):
        return self.table.find_one(query, projection={"_id": False})

    def get_all(self, query):
        return self.table.find(query)

    def add(self, kv_dict):
        return self.table.insert_one(kv_dict)

    def delete(self, query):
        return self.table.delete_many(query)

    def check_exist(self, query):
        ret = self.table.find_one(query)
        return ret is not None

    # Upsert: updates the matching document, or inserts it if none exists
    def update(self, query, kv_dict):
        self.table.update_one(query, {'$set': kv_dict}, upsert=True)


# Fetch and parse a page
def get_page(link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    r = requests.get(link, headers=headers)
    html = r.content.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup


# Parse the post fields from each <li> in the list page
def get_data(post_list):
    data_list = []
    for post in post_list:
        title = post.find('div', class_='titlelink box').a.text.strip()
        post_link = post.find('div', class_='titlelink box').a['href']
        post_link = "https://bbs.hupu.com" + post_link
        author = post.find('div', class_='author box').a.text.strip()
        author_page = post.find('div', class_='author box').a['href']
        start_date = post.find('div', class_='author box').contents[5].text.strip()
        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()
        view = reply_view.split('/')[1].strip()
        reply_time = post.find('div', class_='endreply box').a.text.strip()
        last_reply = post.find('div', class_='endreply box').span.text.strip()
        if ':' in reply_time:  # same-day replies show a time such as 11:27
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        else:
            date_time = datetime.datetime.strptime('2019-' + reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_date,
                          reply, view, last_reply, date_time])
    return data_list


hupu_post = MongoAPI('111.230.95.186', 27017, 'hupu', 'post')
for i in range(1, 51):  # pages 1 through 50
    link = "https://bbs.hupu.com/bxj-" + str(i)
    soup = get_page(link)
    post_all = soup.find('ul', class_="for-list")
    if post_all is None:  # skip pages where the post list is missing
        continue
    post_list = post_all.find_all('li')
    data_list = get_data(post_list)
    for each in data_list:
        # Upsert keyed on post_link so re-crawled posts are updated, not duplicated
        hupu_post.update({"post_link": each[1]}, {"title": each[0],
                                                  "post_link": each[1],
                                                  "author": each[2],
                                                  "author_page": each[3],
                                                  "start_date": str(each[4]),
                                                  "reply": each[5],
                                                  "view": each[6],
                                                  "last_reply": each[7],
                                                  "last_reply_time": str(each[8])})
    time.sleep(3)
    print('Page', i, 'done, pausing 3 seconds')
```
Check that the data was written to the database:
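A quick way to verify from a Python shell (a minimal sketch; the host, database, and collection names match the script above):

```python
from pymongo import MongoClient

client = MongoClient(host='111.230.95.186', port=27017)
collection = client['hupu']['post']
# Count the stored posts and inspect one document
print(collection.count_documents({}))
print(collection.find_one({}, projection={'_id': False}))
```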
Problem encountered: 'NoneType' object has no attribute 'find_all'. This happens when soup.find('ul', class_="for-list") returns None, i.e., the page did not contain the post list.
Solution: guard the call, as already done in the code above, by adding `if post_all is None: continue` before `post_list = post_all.find_all('li')`.
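The missing ul is usually a transient response (for example, an anti-crawler or error page). Beyond skipping the page, a hypothetical retry wrapper could refetch a few times before giving up (get_page here is the function defined in the script above):

```python
import time

def get_page_with_retry(link, retries=3, delay=5):
    """Refetch a page a few times when the post-list <ul> is missing."""
    for attempt in range(retries):
        soup = get_page(link)
        if soup.find('ul', class_='for-list') is not None:
            return soup
        time.sleep(delay)  # wait before retrying
    return soup  # caller still needs to handle a missing list
```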
Reference: 《Python網(wǎng)絡(luò)爬蟲從入門到實(shí)踐》 (Python Web Crawling: From Beginner to Practice)