|
本帖最后由 天天爱玛丽 于 2023-11-10 14:28 编辑
如题,这是我在网上找到的一个专门爬取 169tp 网图片的 Python 爬虫程序。目前我只做到爬取每个美女图集的第一页(5 张图片),而实际上每个图集都有 15 张。我初学 Python,有点不会弄了,希望 Python 大神能帮我完善这个程序,谢谢!重点不在于爬取美女图片,而在于学习 Python;爬取美女图片只是为了让学习不枯燥。
- import requests
- from pyquery import PyQuery as pq
- from bs4 import BeautifulSoup
- import os
- import re
- import time
# Browser-like request headers sent with every HTTP request made by this script.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 '
        'SE 2.X MetaSr 1.0'
    ),
}
# Image download step; it currently reads only the single page it is given.
def Download_the_module(file, tehurl):
    """Download every image found on one album page into ``./图片/<file>/``.

    Only the page at ``tehurl`` is fetched; any further pages of the same
    album are not followed.

    Args:
        file: Album title, used as the name of the sub-directory the images
            are saved into.
        tehurl: URL of the album page to scrape.
    """
    # Fetch the album page; the timeout keeps a stalled server from hanging
    # the whole crawl.
    response = requests.get(tehurl, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    # Elements with class "big_img" hold the images of the page.
    bigimg = soup.select('.big_img')
    # Pull the image addresses out of the selected markup.
    srclist = re.findall(r'https.*?jpg', str(bigimg))
    if not srclist:
        return

    # The title comes from a web page: replace characters that are path
    # separators or invalid in Windows file names, and refuse names made only
    # of dots/spaces (e.g. "..") so nothing is written outside ./图片/.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(file)).strip(' .') or 'untitled'
    save_dir = os.path.join('./图片', safe_name)
    # Create the directory once, up front. The previous if/else created the
    # directory *instead of* writing, so the image being processed at that
    # moment was never saved.
    os.makedirs(save_dir, exist_ok=True)

    for count, imgurl in enumerate(srclist, start=1):
        print("src: " + str(imgurl))
        r = requests.get(imgurl, headers=headers, timeout=30)
        if r.status_code != 200:
            # Do not store an error page under a .jpg name.
            print("skip (HTTP %s): %s" % (r.status_code, imgurl))
            continue
        with open(os.path.join(save_dir, '%s.jpg' % count), 'wb') as f:
            f.write(r.content)
- #os.system('pause')
def GetPageList(page_url):
    """Fetch an album page and return the "共…页" pager text found on it.

    This helper was previously disabled inside a string literal and could not
    run: it matched a ``str`` pattern against ``bytes`` and added a list to
    the return value of ``print``. Nothing in the file calls it yet.

    Args:
        page_url: URL of the album's first page.

    Returns:
        List of substrings matching ``共.*?页`` inside the ``.dede_pages``
        element(s); empty when no pager text is found.
    """
    # Open the album's first page.
    rp = requests.get(page_url, headers=headers, timeout=30)
    # NOTE(review): Climb_to_address decodes this site's list pages as GBK;
    # assumed to hold for album pages as well - confirm against the site.
    rp.encoding = 'gbk'
    soup = BeautifulSoup(rp.text, "html.parser")
    pagelist = soup.select('.dede_pages')
    # Match on text, not bytes: a str pattern cannot be applied to bytes.
    page_num = re.findall(r'共.*?页', str(pagelist))
    print("page_num:", page_num)
    return page_num
# Crawl the list pages and download every album they link to.
def Climb_to_address(a, b):
    """Walk list pages ``a`` .. ``b - 1`` and download each linked album.

    Args:
        a: First list-page number to crawl (inclusive).
        b: List-page number to stop at (exclusive, as in ``range(a, b)``).
    """
    for page in range(a, b):
        URL = 'https://www.169tp.com/gaogensiwa/list_3_%s.html' % page
        # The timeout keeps a stalled connection from hanging the crawl.
        sponse = requests.get(URL, headers=headers, timeout=30)
        # Decode the list page as GBK before reading its text.
        sponse.encoding = 'gbk'
        doc = pq(sponse.text)
        for i in doc('.pic').items():
            # The element text is used as the album (directory) name.
            page_name = i.text()
            print("page name: " + page_name)
            # Address of the album page.
            page_url = i.attr('href')
            if not page_url:
                # A ".pic" element without a link would otherwise pass None to
                # requests.get and abort the whole crawl.
                continue
            # GetPageList(page_url)
            Download_the_module(page_name, page_url)
# There are currently 590 list pages in total; the site's real page count
# changes from time to time.
if __name__ == "__main__":
    a = int(input('开始页数:'))
    b = int(input('结束页数:'))
    # Climb_to_address iterates range(a, b), which stops before b. Pass b + 1
    # so the "end page" the user typed is actually crawled (start=1, end=1
    # previously downloaded nothing).
    Climb_to_address(a, b + 1)
复制代码
|
评分
-
1
查看全部评分
-
|