源码下载链接:ppt.rar - 蓝奏云
PPT下载链接:https://pan.baidu.com/s/1oOIO76xhSw283aHTDhBcPg?pwd=dydk
提取码:dydk
采集的参数
# Crawler parameters. The methods below read these through ``self``.
# NOTE(review): word_content_list is a mutable default; if these are class
# attributes it is shared by all instances -- confirm against the class.
page_count = 1  # page number each category starts from
content = "text/html; charset=gb2312"
base_url = "https://sc.chinaz.com/"  # site to crawl, e.g. https://sc.chinaz.com/tag_ppt/zhongguofeng.html
save_path = "D:\\Sprider\\ChinaZ\\"
sprider_count = 110  # number of resources to collect
haved_sprider_count = 0  # number of resources collected so far
word_content_list = []
folder_name = ""
first_column_name = "ppt"
# Ordinal of the resource to start from; just change this number. It is
# divided by max_pager to find the start page and the offset inside it
# (original debug note: "collecting resource 16 of page 32").
sprider_start_count = 800
max_pager = 20  # number of resources per listing page
采集主体代码
def sprider(self, second_column_name):
    """Crawl one ChinaZ PPT category, listing page by listing page.

    The category slug selects the output folder name and the URL section.
    Crawling starts on the page that contains resource number
    ``self.sprider_start_count`` and continues until either the last listing
    page has been processed or ``self.sprider_count`` resources have been
    downloaded, at which point the Word summary is written via
    ``self.builder_word``.

    :param second_column_name: category slug used in the listing URL
        (``"zhongguofeng"``, ``"xiaoqingxin"`` and ``"kejian"`` are mapped
        explicitly; any other value keeps the current folder/section).
    :return: None
    """
    if second_column_name == "zhongguofeng":
        self.folder_name = "中国风"
        self.first_column_name = "tag_ppt"
    elif second_column_name == "xiaoqingxin":
        self.folder_name = "小清新"
        self.first_column_name = "tag_ppt"
    elif second_column_name == "kejian":
        self.folder_name = "课件"
        self.first_column_name = "ppt"

    # 1-based number of the listing page that contains the start ordinal.
    start_page = int(self.sprider_start_count) // int(self.max_pager) + 1
    second_folder_name = str(self.sprider_count) + "个" + self.folder_name
    # NOTE: save_path is extended in place, so calling sprider() twice on the
    # same instance nests a second "PPT/<folder>" segment.
    self.save_path = self.save_path + os.sep + "PPT" + os.sep + second_folder_name
    BaseFrame().debug("开始采集ChinaZPPT...")

    # base_url may already end with "/"; strip it so the listing URLs do not
    # contain a double slash ("https://sc.chinaz.com//tag_ppt/...").
    site_root = self.base_url.rstrip("/")
    category_url = site_root + "/" + self.first_column_name + "/" + second_column_name

    response = requests.get(category_url + ".html", timeout=10,
                            headers=UserAgent().get_random_header(self.base_url))
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, "html5lib")

    # A missing listing container yields an empty page instead of an
    # uncaught AttributeError.
    listing = soup.find('div', attrs={"class": 'ppt-list'})
    div_list = listing.find_all('div', attrs={"class": 'item'}) if listing is not None else []

    # The last page number is read from the <b> inside the element that
    # immediately precedes the "nextpage" anchor.
    next_anchor = soup.find('a', attrs={"class": 'nextpage'})
    try:
        page_end_number = int(next_anchor.previous_sibling.find('b').string)
    except (AttributeError, TypeError, ValueError) as e:
        # No pager or unexpected markup: treat the category as a single page
        # rather than aborting the whole crawl.
        print("sprider()执行过程出现错误" + str(e))
        page_end_number = 1

    self.page_count = start_page
    while self.page_count <= page_end_number:  # stop after the last page
        try:
            if self.page_count == 1:
                # Page 1 was already downloaded above; reuse its items.
                self.sprider_detail(div_list, self.page_count, page_end_number)
            else:
                if self.haved_sprider_count == self.sprider_count:
                    BaseFrame().debug("采集到达数量采集停止...")
                    BaseFrame().debug("开始写文章...")
                    self.builder_word(self.folder_name, self.save_path, self.word_content_list)
                    BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")
                    break
                # Listing pages after the first are "<slug>_<page>.html".
                next_url = category_url + "_" + str(self.page_count) + ".html"
                response = requests.get(next_url, timeout=10,
                                        headers=UserAgent().get_random_header(self.base_url))
                response.encoding = 'UTF-8'
                soup = BeautifulSoup(response.text, "html5lib")
                div_list = soup.find('div', attrs={"class": 'ppt-list'})
                div_list = div_list.find_all('div', attrs={"class": 'item'})
                self.sprider_detail(div_list, self.page_count, page_end_number)
        except Exception as e:
            # Best-effort crawl: report the failed page and move on.
            print("sprider()执行过程出现错误" + str(e))
        self.page_count = self.page_count + 1  # advance to the next page


def sprider_detail(self, element_list, page_count, max_page):
    """Download every resource listed on one listing page.

    For each item the detail page is fetched, the download link and cover
    image are extracted and handed to ``DownLoad``; successful downloads are
    appended to ``self.word_content_list`` and counted in
    ``self.haved_sprider_count``. When ``page_count`` equals ``max_page`` the
    Word summary is written after the items have been processed.

    :param element_list: the ``div.item`` elements of the listing page.
    :param page_count: 1-based number of the listing page being processed.
    :param max_page: number of the last listing page of the category.
    :return: None. Errors are printed and abort the rest of this page.
    """
    from urllib.parse import urljoin

    try:
        page_size = int(self.max_pager)
        start_ordinal = int(self.sprider_start_count)
        # The in-page start offset only applies to the page on which the
        # crawl begins; later pages must be read from their first item
        # (previously every page skipped its first N items).
        if int(page_count) == start_ordinal // page_size + 1:
            self.sprider_start_index = start_ordinal % page_size
        else:
            self.sprider_start_index = 0

        index = self.sprider_start_index
        element_length = len(element_list)
        while index < element_length:
            item = element_list[index]
            if self.haved_sprider_count == self.sprider_count:
                BaseFrame().debug("采集到达数量采集停止...")
                break
            # Incremented before any "continue" so a skipped item cannot
            # loop forever; the log message therefore shows a 1-based index.
            index = index + 1
            sprider_info = "正在采集第" + str(page_count) + "页的第" + str(index) + "个资源"
            BaseFrame().debug(sprider_info)

            title_image_obj = item.find('img', attrs={"class": 'lazy'})
            url_A_obj = item.find('a', attrs={"class": 'name'})
            # urljoin resolves the href against the site root without
            # producing "//" when both sides carry a slash.
            next_url = urljoin(self.base_url, url_A_obj.get("href"))
            coder_title = title_image_obj.get("alt")

            response = requests.get(next_url, timeout=10,
                                    headers=UserAgent().get_random_header(self.base_url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")

            down_load_file_div = soup.find('div', attrs={"class": 'download-url'})
            if down_load_file_div is None:
                # No download box on the detail page; the log message calls
                # this a paid resource.
                BaseFrame().debug("需要花钱无法下载因此跳过哦....")
                continue
            down_load_file_url = down_load_file_div.find('a').get("href")

            image_obj = soup.find('div', attrs={"class": "one-img-box"}).find('img')
            # data-original is prefixed with the scheme here, so it is
            # presumably protocol-relative ("//host/...") -- TODO confirm.
            image_src = "https:" + image_obj.get("data-original")

            if DownLoad(self.save_path).__down_load_file__(down_load_file_url, coder_title,
                                                           self.folder_name):
                DownLoad(self.save_path).down_cover_image__(image_src, coder_title)  # cover image
                # Record of a successful download: title and cover image path.
                sprider_content = [coder_title,
                                   self.save_path + os.sep + "image" + os.sep + coder_title + ".jpg"]
                self.word_content_list.append(sprider_content)
                self.haved_sprider_count = self.haved_sprider_count + 1
                BaseFrame().debug("已经采集完成第" + str(self.haved_sprider_count) + "个")

        if int(page_count) == int(max_page):
            # Last page reached: write the summary document now, because the
            # caller's loop ends after this page.
            self.builder_word(self.folder_name, self.save_path, self.word_content_list)
            BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")
    except Exception as e:
        # Best-effort: report and let the caller continue with the next page.
        print("sprider_detail:" + str(e))
采集的文件名
初中化学实验课件ppt模板
开学第一课开学季ppt模板设计
大学生情绪压力管理ppt模板课件
简约风格幼小衔接ppt课件免费下载
高考填报志愿课件免费ppt模板下载
岳阳楼记教学设计ppt课件
岳阳楼记ppt课件免费下载第3课时
岳阳楼记ppt课件免费下载第2课时
岳阳楼记ppt课件免费下载第1课时
岳阳楼记译文ppt课件