相关视频——Python爬虫编程基础5天速成(2021全新合集)Python入门+数据分析
找到一个div,在div里面翻找你要的内容。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
|
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
# Patterns compiled once at import time. getData applies each of them to the
# HTML text of a single <div class="item"> entry.

# Detail-page link: the text between the double quotes of <a href="...">.
# The capture group excludes the quotes themselves.
findLink = re.compile(r'<a href="(.*?)">')
# Poster image: the src attribute of an <img> tag. re.S lets "." match
# newlines, and the lazy ".*?" stops at the first src= after each <img
# instead of skipping ahead to the last one in the string.
findImgSrc = re.compile(r'<img.*?src="(.*?)"', re.S)
# Title text of each <span class="title"> element. The lazy group ends at
# the nearest </span>, so a following span on the same line is not swallowed.
findTitle = re.compile(r'<span class="title">(.*?)</span>')
def main():
    """Run the scraper against the Douban Top 250 listing."""
    # getData appends the item offset to this prefix to build each page URL.
    top250_prefix = "https://movie.douban.com/top250?start="
    getData(top250_prefix)
def getData(baseurl, pages=1):
    """Scrape movie records from the listing pages that start at *baseurl*.

    Each page URL is ``baseurl`` followed by the item offset (0, 25, 50, ...).
    Every ``<div class="item">`` found on a page produces one record
    ``[link, image_src, first_title, second_title]``. When an entry does not
    have exactly two titles, ``second_title`` is a single space so that all
    records have the same length.

    Args:
        baseurl: URL prefix to which the item offset is appended.
        pages: Number of 25-item pages to fetch. Defaults to 1, the value
            that used to be hard-coded.

    Returns:
        A list with one record per movie entry, in page order. Each record
        is also printed as it is built.
    """
    datalist = []
    for i in range(pages):
        url = baseurl + str(i * 25)
        html = askURL(url)
        if not html:
            # The download failed (askURL prints the error details itself),
            # so there is nothing to parse for this page.
            continue
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # The regexes work on text, so render the tag back to a string.
            item = str(item)
            data = []

            link = re.findall(findLink, item)[0]
            data.append(link)

            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)
                # The second title is stored with its "/" characters removed.
                otitle = titles[1].replace("/", "")
                data.append(otitle)
            else:
                data.append(titles[0])
                # Placeholder keeps the record the same length in both cases.
                data.append(' ')

            print(data)
            datalist.append(data)
    return datalist
def askURL(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source as a string, or ``""`` when the request fails with
        ``urllib.error.URLError``. In that case the error's ``code`` and/or
        ``reason`` attributes are printed, whichever are present.
    """
    head = {
        # Browser-style User-Agent. Product tokens are written as
        # "name/version" with no spaces inside them, and separate products
        # are delimited by single spaces.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Returned on both paths so callers always get a string.
    return html
# Start the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
|