基于BT之家的剧集爬虫（Python）
本帖最后由 fengdaokanhai 于 2023-3-8 01:05 编辑最近想追追剧,但是该网站的剧集有的是一集一个bt种子,下载太麻烦,所以就想爬取一下“精华主题的”剧集。
整个程序代码比较简单,主要使用到了requests、os、bs4等常用模块,默认下载目录:D:\torrent,不存在就创建。这里只是试着爬取了一下剧集,如果想爬取该网站其他分类的bt,改一下地址和爬取页数等即可。如果需要爬取不同分类的bt,只要改一下,以input()来接收判断分类和页数即可,很简单,可以自己改一改,如有侵权之处,请删除。
"""Crawler for .torrent files of TV series from bt5268.com (a BT forum).

Walks the "digest" (featured) threads of the TV-series board, extracts the
torrent attachment links from each thread, and downloads the .torrent files
into D:/torrent (created if missing).
"""
import requests
import bs4
import os
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.50"
}


def init_movie_list(pages):
    """Build the list of board-page URLs to crawl.

    :param pages: number of board pages to crawl (each lists a few dozen threads)
    :return: list of board-page URLs
    """
    movie_list = []
    # NOTE: the first page uses a different URL pattern than later pages.
    if pages < 2:
        # TV-series board, digest (featured) threads only.
        movie_list.append("https://bt5268.com/forum-index-fid-950-digest-1.htm")
        return movie_list
    for page in range(1, pages + 1):
        movie_list.append(
            "https://bt5268.com/forum-index-fid-950-digest-1-page-" + str(page) + ".htm"
        )
    return movie_list


def get_movie_detail_url(url):
    """Collect (title, detail-page URL) for every thread listed on a board page.

    :param url: board-page URL
    :return: list of (title, absolute URL) tuples; empty list if none found
    """
    context = requests.get(url=url, headers=headers).content
    time.sleep(1)  # throttle: be polite to the server
    soup = bs4.BeautifulSoup(context, "html.parser")
    new_read_details = soup.find_all("a", class_="subject_link thread-digest-1")
    all_details = soup.find_all("a", class_="subject_link thread-old") + new_read_details
    if not all_details:
        return []
    url_list = []
    for item in all_details:
        url_list.append((item.get("title"), "https://bt5268.com/" + item.get("href")))
    return url_list


def get_movie_download_url(url_tuple):
    """Extract torrent file names and download URLs from one thread page.

    :param url_tuple: (thread title, thread detail-page URL)
    :return: (sanitized folder name, list of file names, list of download URLs);
             ('', '', '') when the page has no attachment links
    """
    # Bug fix: the posted code passed the whole tuple to replace_folder_name()
    # and requests.get(); the [0]/[1] subscripts were lost in the forum paste.
    folder_name = replace_folder_name(url_tuple[0])
    url = url_tuple[1]
    resp = requests.get(url=url, headers=headers)
    time.sleep(5)  # throttle to avoid being blocked
    soup = bs4.BeautifulSoup(resp.content, "html.parser")
    result = soup.find_all("a", rel="nofollow", target="_blank", ajaxdialog=False)
    if not result:
        return ('', '', '')
    file_name = []
    bt_url_list = []
    for link in result:
        # Only "attach-dialog" links point at torrent attachments.
        if str(link).find("attach-dialog") == -1:
            continue
        file_name.append(replace_folder_name(link.text))
        # Rewriting "dialog" -> "download" turns the preview URL into a
        # direct download URL.
        download_url = "https://bt5268.com/" + link.get("href").replace("dialog", "download")
        bt_url_list.append(download_url)
    return (folder_name, file_name, bt_url_list)


def replace_folder_name(folder_name):
    """Strip characters that are illegal in Windows file/folder names.

    :param folder_name: raw name taken from the page
    :return: sanitized name safe to use as a Windows path component
    """
    illegal_str = ["?", ",", "/", "\\", "*", "<", ">", "|", " ", "\n", ":"]
    for item in illegal_str:
        folder_name = folder_name.replace(item, "")
    return folder_name


def download_files(f_name, d_url):
    """Download one torrent file into D:/torrent (directory created if missing).

    :param f_name: target file name (already sanitized)
    :param d_url: direct download URL of the torrent
    """
    resp = requests.get(url=d_url, headers=headers)
    time.sleep(5)  # throttle between downloads
    path = 'D:/torrent'
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + "/" + f_name, 'wb') as f:
        f.write(resp.content)


if __name__ == '__main__':
    page_urls = init_movie_list(1)
    url_list = []
    for page_url in page_urls:
        url_list = get_movie_detail_url(page_url) + url_list
    for item in url_list:
        download_tuple = get_movie_download_url(item)
        # Skip threads with no torrent attachments.
        if not download_tuple[1]:
            continue
        # Bug fix: the posted code zipped the 3-tuple with itself and called
        # an undefined download_file(); pair each file name with its URL.
        for fn, df in zip(download_tuple[1], download_tuple[2]):
            download_files(fn, df)
看到这帖子真是高兴! 我只是路过打酱油的。 楼主加油,我们都看好你哦。 强烈支持楼主ing…… 激动人心,无法言表! 看到这帖子真是高兴! 激动人心,无法言表! 强烈支持楼主ing…… 看到这帖子真是高兴!
页:
[1]
2