Downloading a web page for offline use with Python, taking a GitBook page as the example
The code first. It was written with reference to this article:
https://blog.csdn.net/gorquanwu/article/details/81739589
from urllib import request
from bs4 import BeautifulSoup as bs
import time
import os
import re

'''
    Crawls a GitBook site and saves its pages locally for offline use.
    Features: collects URLs from the navigation, and saves each page's
    HTML along with the site's CSS and JS files.
'''

def get_urls(url, baseurl, urls):
    # Collect the links found in the page's <nav> element into the urls list.
    with request.urlopen(url) as f:
        data = f.read().decode('utf-8')
        link = bs(data, 'html.parser').find("nav").find_all('a')
        for i in link:
            suffix = i.get('href')
            # Skip in-page anchors and javascript pseudo-links.
            if suffix is None or suffix == '#' or suffix == '#carousel-example-generic' \
                    or 'javascript:void(0)' in suffix:
                continue
            childurl = baseurl + "/" + suffix
            if childurl not in urls:
                urls.append(childurl)

def get_source(url, path):
    # Download one page and save its HTML, mirroring the URL path on disk.
    try:
        with request.urlopen(url) as f:
            html_source = f.read().decode()
            timeStr = str(int(time.time()))
            pattertitile = '<title>(.*?)</title>'
            patternimg = '<img src="(.*?)"'
            # A timestamped title is computed here, though the final filename
            # below is actually derived from the URL path.
            titleStr = re.compile(pattertitile, re.S).findall(html_source)[0]
            if '|' in titleStr:
                title = (titleStr.split("|")[1]).split(' ')[1] + timeStr
            else:
                title = titleStr + timeStr
            # Rebuild the URL path (everything after the host) as a local file path.
            arrayurl = url.split('/')
            htmlFile = path
            for x in range(3, len(arrayurl)):
                htmlFile += ("/" + arrayurl[x])
            imgHref = re.compile(patternimg, re.S).findall(html_source)  # image URLs, for save_img
            # Create the parent directory if it does not exist yet.
            os.makedirs(os.path.abspath(htmlFile + os.path.sep + ".."), exist_ok=True)
            with open(htmlFile, 'w', encoding='UTF-8') as f:
                f.write(html_source)
            print(htmlFile + " saved successfully")
            time.sleep(1)
    except Exception:
        print(url + " error while saving the html file")

def save_css_js(path):
    # Download the CSS and JS files referenced on the site's front page.
    url = "http://sdk.g-bim.cn"
    with request.urlopen(url) as total_html:
        html_source = total_html.read().decode()
        jsHref = re.compile('<script src="(.*?)"', re.S).findall(html_source)
        cssHref = re.compile('<link rel="stylesheet" href="(.*?)"', re.S).findall(html_source)
        for j in jsHref:
            try:
                with request.urlopen(url + "/" + j) as ww:
                    js_source = ww.read().decode()
                    filename = path + j
                    os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
                    with open(filename, 'w', encoding='UTF-8') as f:
                        f.write(js_source)
                    print(j.split('/')[-1] + " js file saved successfully")
                    time.sleep(1)
            except Exception:
                print(j.split('/')[-1] + " js file could not be downloaded")
        for k in cssHref:
            try:
                with request.urlopen(url + "/" + k) as vv:
                    css_source = vv.read().decode()
                    filename = path + k
                    os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
                    with open(filename, 'w', encoding='UTF-8') as f:
                        f.write(css_source)
                    print(k.split('/')[-1] + " css file saved successfully")
                    time.sleep(1)
            except Exception:
                print(k.split('/')[-1] + " css file could not be downloaded")

def save_img(href, path):
    # Download every image URL in href (binary data, hence 'wb').
    for i in range(0, len(href)):
        url = "http://sdk.g-bim.cn" + href[i]
        filename = path + '/' + href[i].split('/')[-1]
        try:
            with request.urlopen(url) as w:
                img_source = w.read()
                with open(filename, 'wb') as f:
                    f.write(img_source)
                print(href[i].split('/')[-1] + " image saved successfully")
                time.sleep(1)
        except Exception:
            print(href[i].split('/')[-1] + " image could not be downloaded")
            continue

if __name__ == '__main__':
    url = 'http://sdk.g-bim.cn'
    baseurl = 'http://sdk.g-bim.cn'
    urls = []
    get_urls(url, baseurl, urls)
    for u in urls:
        get_source(u, r'../html_bak')
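Note that save_css_js and save_img are defined but never called in the __main__ block above, so only the HTML actually gets fetched. A minimal sketch of wiring them in, reusing the script's imports and assuming the same '../html_bak' output directory and that the image hrefs are root-relative (starting with '/'):

if __name__ == '__main__':
    url = 'http://sdk.g-bim.cn'
    baseurl = 'http://sdk.g-bim.cn'
    path = r'../html_bak'
    urls = []
    get_urls(url, baseurl, urls)
    for u in urls:
        get_source(u, path)
    # Fetch the shared stylesheets and scripts once for the whole site.
    save_css_js(path)
    # Hypothetical: collect <img src="..."> hrefs from the front page and save them.
    with request.urlopen(url) as f:
        html_source = f.read().decode()
    imgHref = re.compile('<img src="(.*?)"', re.S).findall(html_source)
    save_img(imgHref, path)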
A few problems came up along the way.
Reading a file raised a UnicodeDecodeError: 'gbk' codec can't decode byte 0x89 in position 14: illegal multibyte sequence. The fix is to specify the encoding when opening the file, e.g. open('order.log', 'r', encoding='UTF-8'). Specify UTF-8 wherever possible; it is the encoding most HTML uses nowadays, and doing so avoids mojibake.
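On Chinese Windows the default codec is gbk, so any byte outside gbk blows up with exactly this error. When the page encoding is not known in advance, one defensive option is to try UTF-8 first and fall back; a minimal sketch (the gbk fallback is an assumption, not part of the original script):

raw = f.read()  # bytes from urlopen, or from a file opened in 'rb' mode
try:
    text = raw.decode('utf-8')
except UnicodeDecodeError:
    # Fall back to gbk, replacing any bytes that still fail to decode.
    text = raw.decode('gbk', errors='replace')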
On the difference between Windows and Linux path separators in Python: you can simply use '/' everywhere, even on Windows; Python manages the separator when the file is opened, so there is no need to write backslashes.

path = r'../html_bak/'
with open(path, 'w', encoding='UTF-8') as f:
    ...  # '/' works on Windows too; no need for '\\'

See https://blog.csdn.net/qq_29831163/article/details/106263729
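A portable alternative (not used in the script above) is to let the standard library build the path, so the separator question never comes up:

import os
from pathlib import Path

# os.path.join inserts the correct separator for the current platform.
filename = os.path.join('..', 'html_bak', 'index.html')
# pathlib does the same with the / operator.
filename = Path('..') / 'html_bak' / 'index.html'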
To create the parent folder of a given file path when it does not exist yet:

os.makedirs(os.path.abspath(filename + os.path.sep + ".."), exist_ok=True)
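The same effect can be had more directly with os.path.dirname, which avoids the '..' juggling; a minimal equivalent sketch:

import os

# Create every missing directory on the way to filename's parent folder;
# exist_ok=True makes this a no-op if it already exists.
os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)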