python3 爬虫爬取1024图片区

栏目: Python · 发布时间: 7年前

内容简介：python3 爬虫爬取1024图片区

接触 Python 有一段时间了，一直想写个爬虫，然而最近临近期末，实在没什么时间，就先做了个 demo 出来。运行时偶尔会出现一些 error，但是程序还是能跑起来，下个几百张图片还是没问题；剩下的问题估计要到放假才能解决好了。先把代码放上来，以供交流，欢迎大家提出指导意见。

进入正题

我写这个爬虫的时候参考了纯洁的微笑的博客，思路基本差不多，把他的那篇博客也贴出来：http://www.cnblogs.com/ityouknow/p/6013074.html

我的代码如下

from bs4 import BeautifulSoup

import re
import os
import requests
import json
import time

import OpenSSL

# Base URL of the forum; the relative thread links scraped from the listing
# pages are appended to it. The real address is intentionally masked.
mainsite = "http://1024的网址就不贴了.com/"
def getbs(url, timeout=25):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block forever on a stalled
                connection; the default matches the one used for image
                downloads elsewhere in this file.

        Returns:
            A ``BeautifulSoup`` object built from the GBK-decoded response
            body with the ``html5lib`` parser.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        # Browser-like headers sent with every request.
        headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Referer": "http://t66y.com//thread0806.php?fid=16&search=&page=1",
                "Host": "t66y.com",
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # The thread pages are GBK-encoded; without forcing the encoding the
        # decoded text comes out garbled.
        response.encoding = "gbk"
        return BeautifulSoup(response.text, "html5lib")

def getallpage(start,end):
        """Gather thread links from listing pages ``start`` to ``end`` inclusive.

        Each listing page is fetched with ``getbs`` and every ``<a>`` tag whose
        ``href`` matches ``^htm_data.*`` is kept.

        Returns:
            A list of the matching tags, in the order the pages were visited.
        """
        thread_href = re.compile("^htm_data.*")
        collected = []
        for page_no in range(start, end + 1):
                listing = getbs("http://地址打码/thread0806.php?fid=16&search=&page={}".format(str(page_no)))
                collected.extend(listing.find_all("a", {"href": thread_href}))
        return collected
def getpicofpage(url):
        """Download all images of one thread page into a folder named after it.

        The page is fetched with ``getbs``. Image addresses are read from the
        ``src`` attribute of the ``<input>`` tags inside
        ``<div class="tpc_content do_not_catch">``; the folder name comes from
        the text of the first ``<h4>``.

        Args:
            url: Address of the thread page.

        Returns:
            ``-1`` when the page is skipped (no content, no images, folder
            already exists) or a download fails; ``None`` once every image
            has been processed.
        """
        bsobj = getbs(url)
        div = bsobj.find("div", {"class": "tpc_content do_not_catch"})
        heading = bsobj.find("h4")
        # A page without the content div or without a title cannot be stored.
        if div is None or heading is None:
                print("获取不到内容，跳过")
                return -1
        inputs = div.find_all("input")
        if not inputs:
                print("本页无图片，跳过")
                return -1

        # Characters that are illegal in Windows file names would make the
        # directory creation fail, so replace them in the thread title.
        title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", heading.text)
        # NOTE(review): ``path`` is expected to be defined elsewhere in the file.
        folder = path + "new\\tupian\\" + "\\" + title
        if os.path.exists(folder):
                print("已存在该文件夹，跳过")
                return -1
        # makedirs also creates missing parent folders, unlike mkdir.
        os.makedirs(folder)

        num = 1
        for tag in inputs:
                src = tag.get("src")
                if not src:
                        # An <input> without an image address has nothing to download.
                        continue
                try:
                        res = requests.get(src, timeout=25)
                except requests.exceptions.Timeout:
                        # Without a timeout a slow image host can block the crawl forever.
                        print("已超时，跳过本页")
                        return -1
                except OpenSSL.SSL.WantReadError:
                        print("OpenSSL.SSL.WantReadError,跳过")
                        return -1
                except requests.exceptions.RequestException as exc:
                        # requests wraps low-level SSL/proxy failures (for example
                        # ProxyError) in its own exception types, so the raw
                        # OpenSSL error above is normally never seen here.
                        print("下载失败，跳过: {}".format(exc))
                        return -1
                # The timestamp only has one-second resolution; appending the
                # counter keeps images saved within the same second from
                # overwriting each other.
                target = folder + "\\" + str(time.time())[:10] + "_" + str(num) + ".jpg"
                with open(target, 'wb') as f:
                        f.write(res.content)
                print(num)
                num += 1
# Driver: collect the thread links of listing pages 5-10 and download the
# pictures of each thread once.
l = getallpage(5, 10)
page = 1
ed = []  # thread URLs already processed, used to skip repeated links
for i in l:
        url = mainsite + i["href"]
        if url not in ed:
                print(url)
                getpicofpage(url)
                ed.append(url)
                print("采集完第{}页".format(page))
                page += 1
                # Pause between threads before issuing the next request.
                time.sleep(3)
        else:
                print(url + "本页已采集过，跳过")

另外也把上面说的ssl异常贴出来：

Traceback (most recent call last):

File "D:\python\Lib\site-packages\urllib3\contrib\pyopenssl.py", line 441, in wrap_socket

cnx.do_handshake()

File "D:\python\Lib\site-packages\OpenSSL\SSL.py", line 1806, in do_handshake

self._raise_ssl_error(self._ssl, result)

File "D:\python\Lib\site-packages\OpenSSL\SSL.py", line 1521, in _raise_ssl_error

raise WantReadError()

OpenSSL.SSL.WantReadError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 595, in urlopen

self._prepare_proxy(conn)

File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 816, in _prepare_proxy

conn.connect()

File "D:\python\Lib\site-packages\urllib3\connection.py", line 326, in connect

ssl_context=context)

File "D:\python\Lib\site-packages\urllib3\util\ssl_.py", line 329, in ssl_wrap_socket

return context.wrap_socket(sock, server_hostname=server_hostname)

File "D:\python\Lib\site-packages\urllib3\contrib\pyopenssl.py", line 445, in wrap_socket

raise timeout('select timed out')

socket.timeout: select timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

File "D:\python\Lib\site-packages\requests\adapters.py", line 440, in send

timeout=timeout

File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen

_stacktrace=sys.exc_info()[2])

File "D:\python\Lib\site-packages\urllib3\util\retry.py", line 388, in increment

raise MaxRetryError(_pool, url, error or ResponseError(cause))

urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.srimg.com', port=443): Max retries exceeded with url: /u/20180104/11315126.jpg (Caused by ProxyError('Cannot connect to proxy.', timeout('select timed out',)))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

File "D:\PyCharm 2017.3.1\helpers\pydev\pydev_run_in_console.py", line 52, in run_file

pydev_imports.execfile(file, globals, locals) # execute the script

File "D:\PyCharm 2017.3.1\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile

exec(compile(contents+"\n", file, 'exec'), glob, loc)

File "D:/learnPython/crawler/crawler.py", line 301, in <module>

getpicofpage(url)

File "D:/learnPython/crawler/crawler.py", line 281, in getpicofpage

res = requests.get(i["src"],timeout=25)

File "D:\python\Lib\site-packages\requests\api.py", line 72, in get

return request('get', url, params=params, **kwargs)

File "D:\python\Lib\site-packages\requests\api.py", line 58, in request

return session.request(method=method, url=url, **kwargs)

File "D:\python\Lib\site-packages\requests\sessions.py", line 508, in request

resp = self.send(prep, **send_kwargs)

File "D:\python\Lib\site-packages\requests\sessions.py", line 618, in send

r = adapter.send(request, **kwargs)

File "D:\python\Lib\site-packages\requests\adapters.py", line 502, in send

raise ProxyError(e, request=request)

requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.srimg.com', port=443): Max retries exceeded with url: /u/20180104/11315126.jpg (Caused by ProxyError('Cannot connect to proxy.', timeout('select timed out',)))

PyDev console: starting.

还有一点：虽然我开了 VPN，但是直接爬是获取不到内容的，会提示主机没有响应；后来发现开了 Fiddler 就能爬了，估计是 IP 的原因。这个我还没仔细深究，也请各位不吝赐教。

以上就是本文的全部内容，希望本文的内容对大家的学习或者工作能带来一定的帮助，也希望大家多多支持码农网

查看所有标签

猜你喜欢:

本站部分资源来源于网络，本站转载出于传递更多信息之目的，版权归原作者或者来源机构所有，如转载稿涉及版权问题，请联系我们。

码农书籍

谁说商业直觉是天生的

[美] 戴夫·帕特奈克 (Dev Patnaik)、[美] 彼得·莫特森 (Peter Mortensen) / 马慧 / 万卷出版公司 / 2010-07 / 36.00

《Wired to Care》是帕特奈克集近年来在创新顾问公司 Jump Associates 实务经验，与史丹佛大学教学经验之大成，虽然《Wired to Care》定位为一本用设计创新方法谈企业管理的书，但本书活像是一本近代的设计史，从以销售为设计目标的 Raymond Loewy 谈起，到以人为设计中心的 OXO GOOD GRIPS Swivel Peeler 削皮刀。由此作者向我们揭示了企业如何运......一起来看看《谁说商业直觉是天生的》这本书的介绍吧!

码农工具