本文共 2623 字,大约阅读时间需要 8 分钟。
'''Case study: Kongjie.com photo-album crawler (bs4).
http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1
Crawls album photos, names each file uid + picid + '.jpg', and saves them
under the images directory.
'''
import requests
from bs4 import BeautifulSoup
import time
import random
import re
import os

# Extracts uid and picid from a photo-detail URL, e.g.
# http://www.kongjie.com/home.php?mod=space&uid=48196&do=album&picid=864985
# Compiled once at module load instead of on every parse_photo_detail call.
UID_PICID_PAT = re.compile(r'uid=(\d+).*?picid=(\d+)')


def get_ua():
    '''Return default request headers carrying a desktop Chrome User-Agent.'''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    return headers


def parse_album_list(url):
    '''Walk the album-list pages starting at *url* and crawl every album.

    Pagination is followed iteratively: the original recursed once per page,
    which risks RecursionError on a long page chain.
    '''
    while url:
        response = requests.get(url, headers=get_ua(), timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        # Extract the album list
        ls = soup.select('ul.ml.mla.cl > li > div > a')
        print('len:', len(ls))
        for each in ls:
            album_url = each['href']
            parse_photo_list(album_url)
        # Pagination: follow the "next page" link if present
        next_page = soup.select('a.nxt')
        url = next_page[0]['href'] if next_page else None


def parse_photo_list(url):
    '''Walk the photo-list pages of one album and crawl every photo.'''
    while url:
        print('album url:', url)
        response = requests.get(url, headers=get_ua(), timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        # Extract the photo list
        ls = soup.select('ul.ptw.ml.mlp.cl > li > a')
        print('photo list len:', len(ls))
        for each in ls:
            photo_url = each['href']
            parse_photo_detail(photo_url)
            # Random 0-2 s pause between photos to be gentle on the server
            time.sleep(random.random() * 2)
        # Pagination (iterative, see parse_album_list)
        next_page = soup.select('a.nxt')
        if next_page:
            url = next_page[0]['href']
            print('next page url:', url)
        else:
            url = None


def parse_photo_detail(photo_url):
    '''Extract uid/picid from *photo_url*, locate the image and download it.

    Does nothing when the URL carries no uid/picid pair or the detail page
    has no ``img#pic`` element (the original crashed with TypeError there).
    '''
    match_obj = UID_PICID_PAT.search(photo_url)
    if match_obj is None:
        return
    uid = match_obj.group(1)
    picid = match_obj.group(2)
    print('uid:', uid)
    print('picid:', picid)
    # Extract the photo's direct link from the detail page
    response = requests.get(photo_url, headers=get_ua(), timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    img = soup.select_one('img#pic')
    if img is None or not img.get('src'):
        return
    save_img(img['src'], uid, picid)


def save_img(url, uid, picid):
    '''Download one photo to ./images/<uid>_<picid>.jpg.'''
    print('pic_url:', url)
    response = requests.get(url, headers=get_ua(), timeout=30)
    # Ensure the target directory exists; the original assumed it did and
    # raised FileNotFoundError on a fresh checkout.
    os.makedirs('./images', exist_ok=True)
    img_path = './images/' + uid + "_" + picid + '.jpg'
    with open(img_path, 'wb') as file:
        file.write(response.content)


if __name__ == "__main__":
    url = "http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1"
    parse_album_list(url)
转载地址:http://jhlrf.baihongyu.com/