Python Web Scraping Practice Notes
Notes on web-scraping techniques learned from YouTube. The goal is to crawl the articles on the first page of the PTT movie board.
Scraping a simple static page
Method 1
import urllib.request as req
import bs4

url = 'https://www.ptt.cc/bbs/movie/index.html'
# Build a Request object with custom headers to avoid a 403 Forbidden response
request = req.Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
})
with req.urlopen(request) as response:  # send the request and read the response body
    data = response.read().decode('utf-8')
root = bs4.BeautifulSoup(data, 'html.parser')

# Pull every article title out of the page
titles = root.find_all('div', class_='title')  # find all <div> tags with class="title"
for title in titles:
    if title.a:  # only posts that still exist have an <a> tag
        print(title.a.string)
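The same `root` object can also give each article's full URL, since every title's <a> tag carries a relative href. A minimal sketch, assuming the snippet above has already run:

# Assumes `root` from method 1 is still in scope
for title in root.find_all('div', class_='title'):
    if title.a:  # deleted posts have no <a> tag
        link = 'https://www.ptt.cc' + title.a['href']  # href is relative to the site root
        print(title.a.string, link)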
Method 2
import requests
import bs4

url = 'https://www.ptt.cc/bbs/movie/index.html'
# Headers kept on hand in case the site starts returning 403; this page answers without them
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36 OPR/62.0.3331.116"}
r = requests.get(url)  # GET the page's HTML; no headers needed here
soup = bs4.BeautifulSoup(r.text, 'html.parser')
print(soup)  # dump the whole parsed page for a first look at the HTML

titles = soup.find_all('div', class_='title')
for title in titles:
    if title.a:
        print(title.a.string)
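If the site ever starts rejecting the default python-requests User-Agent with a 403, the headers dict prepared above can simply be passed in, and checking the status before parsing avoids silently scraping an error page. A minimal sketch:

r = requests.get(url, headers=headers)  # reuse the headers dict defined above
r.raise_for_status()  # raise an exception on 403/404 instead of parsing an error page
soup = bs4.BeautifulSoup(r.text, 'html.parser')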
Following links dynamically to automate the crawl
Crawl three pages of the PTT Gossiping board in one run
import urllib.request as req
import bs4

def craw(url):
    # Build a Request object with custom headers to avoid a 403 Forbidden response
    request = req.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
        'cookie': 'over18=1'  # copy this cookie from the browser's developer tools; without it the age check hides the content
    })
    with req.urlopen(request) as response:  # send the request and read the response body
        data = response.read().decode('utf-8')
    root = bs4.BeautifulSoup(data, 'html.parser')

    # Pull every article title out of the page
    titles = root.find_all('div', class_='title')  # find all <div> tags with class="title"
    for title in titles:
        if title.a:  # only posts that still exist have an <a> tag
            print(title.a.string)

    # Find the link to the page we should crawl next
    next_url = root.find('a', string='‹ 上頁')  # the <a> tag whose text is "‹ 上頁" (previous page)
    return next_url['href']  # return the href attribute of that <a> tag

target_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
counter = 0
while counter < 3:
    print('----this is page', counter + 1, '----')  # print the banner before that page's titles
    target_url = 'https://www.ptt.cc' + craw(target_url)
    counter += 1
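The same three-page crawl can also be written with the requests library, where a Session object carries the over18 cookie and the User-Agent across requests automatically. A minimal sketch under the same assumptions about PTT's markup:

import requests
import bs4

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
session.cookies.set('over18', '1')  # same age-check cookie as above

url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
for page in range(1, 4):
    print('----this is page', page, '----')
    soup = bs4.BeautifulSoup(session.get(url).text, 'html.parser')
    for title in soup.find_all('div', class_='title'):
        if title.a:
            print(title.a.string)
    prev = soup.find('a', string='‹ 上頁')  # link to the next-older page
    url = 'https://www.ptt.cc' + prev['href']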