Saving All Images from a Web Page


Web Scraping: Saving All Images from a Web Page

The IMGSAVE class below downloads a page, extracts every <img> src attribute with a regular expression, resolves relative paths against the site's base URL, and writes each image to disk.

import re
import traceback

import cchardet
import requests
from urllib.parse import urlparse, urljoin
from requests.exceptions import RequestException, ProxyError, SSLError

class IMGSAVE(object):
    def __init__(self, url):
        self.url = url

    def ask_url(self, url, headers=None, timeout=10, binary=False, debug=False):
        # Prepend a scheme when the caller passes a bare domain.
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        the_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        }
        if headers:
            the_headers = headers
        # Defaults returned when the request fails or is not a 200.
        status = 0
        html = b'' if binary else ''
        redirected_url = url
        try:
            response = requests.get(url, headers=the_headers, timeout=timeout)
            status = response.status_code
            redirected_url = response.url
            if response.status_code == 200:
                if binary:
                    html = response.content
                else:
                    # Detect the encoding before decoding the raw bytes.
                    encoding = cchardet.detect(response.content)['encoding']
                    html = response.content.decode(encoding)
        except (RequestException, ProxyError, SSLError) as e:
            print(e)
            if debug:
                traceback.print_exc()
            print('Failed download: {}'.format(url))
        return status, html, redirected_url

    def regex(self, html):
        # Non-greedy match: capture the src attribute of every <img> tag.
        pattern = re.compile('<img.*?src="(.*?)".*?>', re.S)
        result = re.findall(pattern, html)
        print(result)
        return result

    def save_img(self, filename, content):
        with open(filename, "wb") as fb:
            fb.write(content)

    def suffix(self, path):
        # Add any other extensions you need to this list.
        suffixes = [".png", ".jpg", ".jpeg"]
        for i in suffixes:
            if i in path.lower():
                return i
        return ".jpg"  # fallback so the filename still gets an extension
    def start(self, url):
        status, html, redirected_url = self.ask_url(url=url, debug=True)
        # Rebuild the base URL (scheme + netloc) from the final, possibly
        # redirected, address; see the urllib.parse demo below.
        the_parse = urlparse(redirected_url)
        base_url = "{scheme}://{netloc}".format(scheme=the_parse.scheme, netloc=the_parse.netloc)
        print(base_url)
        r_u = self.regex(html)  # src values: relative paths or absolute URLs
        if not r_u:
            return
        # Resolve relative src values against the base URL.
        img_urls = [urljoin(base_url, img) for img in r_u]
        for index, img_url in enumerate(img_urls):
            ext = self.suffix(img_url)
            filename = f'{index}{ext}'
            content = self.ask_url(img_url, debug=True, binary=True)[1]
            self.save_img(filename, content)
def main():
    url = "www.bbiquge.net"
    work = IMGSAVE(url)
    work.start(url)


if __name__ == '__main__':
    main()
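
To see what the regex and urljoin steps do in isolation, here is a minimal sketch; the HTML snippet and example.com URLs are made up for illustration:

import re
from urllib.parse import urljoin

html = '''
<img src="/assets/logo.png" alt="logo">
<img src="https://cdn.example.com/banner.jpg">
'''

# Same pattern as IMGSAVE.regex: non-greedy capture of each src attribute.
pattern = re.compile('<img.*?src="(.*?)".*?>', re.S)
srcs = re.findall(pattern, html)
print(srcs)  # ['/assets/logo.png', 'https://cdn.example.com/banner.jpg']

# urljoin resolves relative paths but leaves absolute URLs untouched.
base_url = "https://www.example.com"
print([urljoin(base_url, s) for s in srcs])
# ['https://www.example.com/assets/logo.png', 'https://cdn.example.com/banner.jpg']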

urllib.parse

import urllib.parse

url="https://www.bbiquge.net"
test=urllib.parse.urlparse(url)
print(test)
输出-----------------------
ParseResult(scheme='https', netloc='bornforthis.cn', path='/1v1/17-R/01.html', params='', query='', fragment='')
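
Building on the ParseResult above, the scheme and netloc can be recombined into a base URL, which is exactly what start() does; the image path below is a made-up example:

from urllib.parse import urlparse, urljoin

url = "https://bornforthis.cn/1v1/17-R/01.html"
p = urlparse(url)
base_url = "{scheme}://{netloc}".format(scheme=p.scheme, netloc=p.netloc)
print(base_url)  # https://bornforthis.cn

# A relative src found on the page then resolves against the base URL:
print(urljoin(base_url, "/assets/pic.png"))  # https://bornforthis.cn/assets/pic.png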

Template for processing image links

url="https://bornforthis.cn/assets/image-20230104195900212-3876c2ce.pngxxxxxx"
def suffix(path):
    suffix=[".png",".jpg",".jpge"] #可以添加任意后缀到列表中
    for i in suffix:
        if i not in path:
            continue
        pos=path.find(i)
        print(path[pos:pos+len(i)])
suffix(url)
输出------------------------------------
.png
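
As an aside, a hard-coded extension list can be avoided with the standard library: urlparse isolates the path and os.path.splitext splits off whatever extension it ends with. A minimal sketch (using the same sample image URL without the trailing junk):

import os
from urllib.parse import urlparse

url = "https://bornforthis.cn/assets/image-20230104195900212-3876c2ce.png"
path = urlparse(url).path        # just the path, query string stripped
ext = os.path.splitext(path)[1]  # '.png'
print(ext)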