Python3.6版本,代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
"""Download the thumbnail images from list pages 1-9 of a 27270.com gallery."""
import os
import re
import ssl
import urllib.error
import urllib.request

# Disable HTTPS certificate verification for every urllib request made by
# this script (kept from the original; it trades security for convenience).
ssl._create_default_https_context = ssl._create_unverified_context

# Default folder that receives the downloaded thumbnails.
SAVE_DIR = "/Users/adongblog/Desktop/MyGame/Python/Img"

# DOTALL lets ".*?" run across line breaks inside the container block.
_BOX_RE = re.compile(r'<div class="MeinvTuPianBox".*?</ul>', re.DOTALL)
# Captures the image URL directly; [^"] keeps the match inside the attribute.
_IMG_RE = re.compile(r'<img src="(http://[^"]*?\.jpg)')


def _fetch_html(url):
    """Return the body of *url* as text, closing the connection afterwards."""
    with urllib.request.urlopen(urllib.request.Request(url)) as resp:
        # Only ASCII markup and URLs are inspected, so undecodable bytes
        # (the page may not be UTF-8) are replaced instead of raising.
        return resp.read().decode("utf-8", errors="replace")


def getimgtwo(url, page, save_dir=SAVE_DIR):
    """Download every thumbnail referenced on one list page.

    Args:
        url: Address of the list page to fetch.
        page: Page number; it becomes part of each output file name.
        save_dir: Folder that receives the files; created when missing.

    Files are written as ``meinv<page>_<n>.jpg`` where ``n`` counts the
    images on the page from 1. A page without the expected container is
    reported and skipped. A failed image download is reported and the
    remaining images are still attempted.

    Raises:
        urllib.error.URLError: If the list page itself cannot be fetched.
    """
    box = _BOX_RE.search(_fetch_html(url))
    if box is None:
        print("no image container found: " + url)
        return
    image_urls = _IMG_RE.findall(box.group(0))
    if not image_urls:
        return
    os.makedirs(save_dir, exist_ok=True)
    for x, imgurl in enumerate(image_urls, start=1):
        print(str(x) + "->" + imgurl)
        imagename = os.path.join(
            save_dir, "meinv" + str(page) + "_" + str(x) + ".jpg")
        try:
            urllib.request.urlretrieve(imgurl, filename=imagename)
        except urllib.error.URLError as err:
            # HTTPError carries a status code; every URLError has a reason.
            if hasattr(err, "code"):
                print("add", err.code)
            if hasattr(err, "reason"):
                print("add2", err.reason)


def main():
    """Walk list pages 1-9 and download the thumbnails of each."""
    for i in range(1, 10):
        url = ("http://www.27270.com/ent/meinvtupian/list_11_"
               + str(i) + ".html")
        try:
            getimgtwo(url, i)
        except urllib.error.URLError as err:
            # One unreachable list page should not abort the whole run.
            print("skip page " + str(i) + ": " + str(err))


if __name__ == "__main__":
    main()
一下子爬取了几百张图片,嘿嘿!
接下来爬取大图
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
"""Download the large images of one 27270.com gallery, page by page."""
import os
import re
import ssl
import urllib.error
import urllib.request

# Disable HTTPS certificate verification for every urllib request made by
# this script (kept from the original; it trades security for convenience).
ssl._create_default_https_context = ssl._create_unverified_context

# Default folder that receives the downloaded images.
SAVE_DIR = "/Users/adongblog/Desktop/MyGame/Python/ImgBig"

# DOTALL lets ".*?" run across line breaks inside the article body block.
_BODY_RE = re.compile(r'<div class="articleV4Body".*?</a>', re.DOTALL)
# Captures the image URL directly; [^>] / [^"] keep the match inside one tag.
_IMG_RE = re.compile(r'<img[^>]*?src="(http://[^"]*?\.jpg)')


def _fetch_html(url):
    """Return the body of *url* as text, closing the connection afterwards."""
    with urllib.request.urlopen(urllib.request.Request(url)) as resp:
        # Only ASCII markup and URLs are inspected, so undecodable bytes
        # (the page may not be UTF-8) are replaced instead of raising.
        return resp.read().decode("utf-8", errors="replace")


def getimgtwo(url, page, name, save_dir=SAVE_DIR):
    """Download the large image(s) shown on one gallery page.

    Args:
        url: Address of the gallery page to fetch.
        page: Page number inside the gallery; part of the file name.
        name: File-name prefix (the driver passes the gallery id).
        save_dir: Folder that receives the files; created when missing.

    The first image is saved as ``<name>_<page>.jpg``. Any further image on
    the same page gets an extra ``_<n>`` suffix so it no longer overwrites
    the first one. A page without the expected container is reported and
    skipped. A failed image download is reported and the remaining images
    are still attempted.

    Raises:
        urllib.error.URLError: If the gallery page itself cannot be fetched.
    """
    body = _BODY_RE.search(_fetch_html(url))
    if body is None:
        print("no image container found: " + url)
        return
    image_urls = _IMG_RE.findall(body.group(0))
    if not image_urls:
        return
    os.makedirs(save_dir, exist_ok=True)
    for x, imgurl in enumerate(image_urls, start=1):
        print(str(page) + "->" + imgurl)
        stem = name + "_" + str(page)
        if x > 1:
            stem += "_" + str(x)
        imagename = os.path.join(save_dir, stem + ".jpg")
        try:
            urllib.request.urlretrieve(imgurl, filename=imagename)
        except urllib.error.URLError as err:
            # HTTPError carries a status code; every URLError has a reason.
            if hasattr(err, "code"):
                print("add", err.code)
            if hasattr(err, "reason"):
                print("add2", err.reason)


def main(listid=258932, pages=7):
    """Download pages 1..*pages* of the gallery identified by *listid*."""
    for i in range(1, pages + 1):
        url = ("http://www.27270.com/ent/meinvtupian/2018/"
               + str(listid) + "_" + str(i) + ".html")
        try:
            getimgtwo(url, i, str(listid))
        except urllib.error.URLError as err:
            # One unreachable page should not abort the whole run.
            print("skip page " + str(i) + ": " + str(err))


if __name__ == "__main__":
    main()
代码片段三
再来补全一下:通过输入列表网址,抓取列表中所有的大图;再做个小的窗口界面,更直观一些,如下图。
贴出代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
"""Small Tk front end: paste a 27270.com list URL and download its galleries."""
import os
import re
import ssl
import time
import tkinter as tk
import urllib.error
import urllib.request
from tkinter import messagebox

# Disable HTTPS certificate verification for every urllib request made by
# this script (kept from the original; it trades security for convenience).
ssl._create_default_https_context = ssl._create_unverified_context

# Default folder that receives the downloaded images.
SAVE_DIR = "D:/python/ImgBigList2"

# Date stamp (YYYYMMDD, local time) used as the file-name prefix.
timename = time.strftime('%Y%m%d', time.localtime(time.time()))

# DOTALL lets ".*?" run across line breaks inside the matched blocks.
_BODY_RE = re.compile(r'<div class="articleV4Body".*?\.jpg', re.DOTALL)
_IMG_RE = re.compile(r'<img[^>]*?src="(http://[^"]*?\.jpg)')
_BOX_RE = re.compile(r'<div class="MeinvTuPianBox".*?</ul>', re.DOTALL)
# Captures the gallery URL of every link that is followed by "MMPic".
_LINK_RE = re.compile(r'<a href="(http://[^"]*?\.html).*?"MMPic"', re.DOTALL)

window = None  # Tk root window; created by main()
e = None       # URL entry widget; created by main(), read by getimgbtn()


def _fetch_html(url):
    """Return the body of *url* as text, closing the connection afterwards."""
    with urllib.request.urlopen(urllib.request.Request(url)) as resp:
        # Only ASCII markup and URLs are inspected, so undecodable bytes
        # (the page may not be UTF-8) are replaced instead of raising.
        return resp.read().decode("utf-8", errors="replace")


def getimgtwo(url, listid, page, save_dir=SAVE_DIR):
    """Download the large image shown on one gallery page.

    Args:
        url: Address of the gallery page to fetch.
        listid: Position of the gallery within the list page.
        page: Page number inside the gallery.
        save_dir: Folder that receives the files; created when missing.

    The first image is saved as ``<date>_<page>_<listid>.jpg``. Any further
    image on the same page gets an extra ``_<n>`` suffix so it does not
    overwrite the first. A page without the expected container is reported
    and skipped.

    Raises:
        urllib.error.URLError: If the gallery page itself cannot be fetched.
    """
    body = _BODY_RE.search(_fetch_html(url))
    if body is None:
        print("no image container found: " + url)
        return
    image_urls = _IMG_RE.findall(body.group(0))
    if not image_urls:
        return
    os.makedirs(save_dir, exist_ok=True)
    for x, imgurl in enumerate(image_urls, start=1):
        print(imgurl)
        stem = timename + "_" + str(page) + "_" + str(listid)
        if x > 1:
            stem += "_" + str(x)
        imagename = os.path.join(save_dir, stem + ".jpg")
        try:
            urllib.request.urlretrieve(imgurl, filename=imagename)
        except urllib.error.URLError as err:
            # HTTPError carries a status code; every URLError has a reason.
            if hasattr(err, "code"):
                print("add", err.code)
            if hasattr(err, "reason"):
                print("add2", err.reason)


def getimglist(url, max_pages=7, save_dir=SAVE_DIR):
    """Download the images of every gallery linked from one list page.

    Args:
        url: Address of the list page.
        max_pages: Number of pages tried per gallery (``_1`` .. ``_N``).
        save_dir: Folder handed on to getimgtwo().

    For each gallery, pages are fetched in order. The first page that cannot
    be fetched ends that gallery and the next gallery is started.

    Raises:
        urllib.error.URLError: If the list page itself cannot be fetched.
        ValueError: If *url* is not a valid URL.
    """
    box = _BOX_RE.search(_fetch_html(url))
    if box is None:
        print("no gallery list found: " + url)
        return
    for y, imgurl in enumerate(_LINK_RE.findall(box.group(0)), start=1):
        print(str(y) + imgurl)
        # "<base>.html" -> "<base>_<j>.html" addresses page j of the gallery.
        base = imgurl.replace('.html', '')
        try:
            for j in range(1, max_pages + 1):
                getimgtwo(base + "_" + str(j) + ".html", y, j, save_dir)
        except urllib.error.URLError as err:
            if hasattr(err, "code"):
                print("add", err.code)
            if hasattr(err, "reason"):
                print("add2", err.reason)


def getimgbtn():
    """Button callback: validate the entered URL and start the download.

    NOTE: the download runs inside the Tk callback, so the window does not
    process events until it has finished.
    """
    varurl = e.get().strip()
    if varurl == '':
        messagebox.showerror(title='错误', message='输入网址为空')
        return
    try:
        getimglist(varurl)
    except (ValueError, urllib.error.URLError) as err:
        # An invalid or unreachable address is shown to the user instead of
        # only appearing as a traceback on the console.
        messagebox.showerror(title='错误', message='无法打开该网址: ' + str(err))


def main():
    """Build the window (URL entry plus start button) and run the event loop."""
    global window, e
    window = tk.Tk()
    window.title("抓美女,嘿嘿!")
    window.geometry('500x100')
    e = tk.Entry(window, show=None, width=60)
    e.pack()
    # pack() returns None, so the widget is created first and packed after.
    btn = tk.Button(window, text='开始', width=15, height=2, command=getimgbtn)
    btn.pack()
    window.mainloop()


if __name__ == "__main__":
    main()
输入列表网址,点击开始后,自动工作了,大约一个列表200张图尽收文件夹,嘿嘿!