Install the dependencies:

```bash
pip install requests beautifulsoup4 trafilatura tqdm
```

Crawler code:

```python
#!/usr/bin/env python3
"""Rocketech knowledge-base crawler for building a RAG dataset.

Usage: python crawl_rocketech.py [--max_pages 500] [--delay 2] [--output data.jsonl]
"""
import argparse
import json
import time
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
import trafilatura
from bs4 import BeautifulSoup
from tqdm import tqdm


class RocketechCrawler:
    def __init__(self, start_url, output_file="rocketech_data.jsonl", max_pages=500, delay=2.0):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.base_url = f"https://{self.domain}/"
        self.output_file = output_file
        self.max_pages = max_pages
        self.delay = delay

        # Visited set / frontier queue
        self.visited = set()
        self.to_visit = [start_url]

        # robots.txt rules
        self.rp = RobotFileParser()
        self.rp.set_url(f"https://{self.domain}/robots.txt")
        try:
            self.rp.read()
        except Exception:
            print("Could not fetch robots.txt; crawling will not be restricted.")
            self.rp = None

        # HTTP session with a custom User-Agent
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "RocketechRAGBot/1.0 (research project; contact@example.com)"
        })

    def can_fetch(self, url):
        """Check whether robots.txt allows fetching this URL."""
        if self.rp is None:
            return True
        return self.rp.can_fetch(self.session.headers["User-Agent"], url)

    def normalize_url(self, url):
        """Strip fragment and query, drop the trailing slash, lowercase the domain."""
        parsed = urlparse(url)
        norm = parsed._replace(fragment="", query="", netloc=parsed.netloc.lower()).geturl()
        norm = norm.rstrip("/")
        return norm

    def extract_links(self, html, current_url):
        """Extract all same-domain links from a page."""
        soup = BeautifulSoup(html, "html.parser")
        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            full_url = urljoin(current_url, href)
            parsed = urlparse(full_url)
            # Keep only http/https links on the same domain
            if parsed.scheme in ("http", "https") and parsed.netloc == self.domain:
                # Skip resource files (images, PDFs, etc.)
                if not any(parsed.path.lower().endswith(ext) for ext in
                           [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".gif",
                            ".mp3", ".mp4", ".doc", ".docx"]):
                    normalized = self.normalize_url(full_url)
                    if normalized not in self.visited:
                        links.add(normalized)
        return links

    def process_page(self, url):
        """Fetch a page and extract its title and main text."""
        try:
            resp = self.session.get(url, timeout=30)
            if resp.status_code != 200:
                return None, None
            html = resp.text
        except Exception as e:
            print(f"Request failed {url}: {e}")
            return None, None

        # Extract the main content; trafilatura strips navigation/footer/ads automatically
        downloaded = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=False,
            no_fallback=False,
            favor_precision=True,
        )
        if not downloaded:
            # Fallback: plain-text extraction with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            for script in soup(["script", "style", "nav", "footer", "header"]):
                script.decompose()
            text = soup.get_text(separator="\n")
            text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
        else:
            text = downloaded.strip()

        # Page title
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.string.strip() if soup.title and soup.title.string else url
        return title, text

    def save_record(self, record):
        """Append one record as a line of JSONL."""
        with open(self.output_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def crawl(self):
        print(f"Starting crawl of {self.start_url}, target page count: {self.max_pages}")
        pbar = tqdm(total=self.max_pages, desc="Pages crawled")

        while self.to_visit and len(self.visited) < self.max_pages:
            # Take the next URL from the frontier
            url = self.to_visit.pop(0)
            norm_url = self.normalize_url(url)
            if norm_url in self.visited:
                continue

            # Respect robots.txt
            if not self.can_fetch(url):
                self.visited.add(norm_url)
                continue

            print(f"\nProcessing: {url}")
            title, text = self.process_page(url)

            # Save the record
            if text and len(text) > 50:  # skip pages with too little text
                record = {
                    "url": url,
                    "title": title,
                    "text": text,
                    "timestamp": time.time(),
                }
                self.save_record(record)
                print(f"Saved, text length: {len(text)} characters")
            else:
                print("Skipped: no usable text")

            # Mark as visited and extract new links
            self.visited.add(norm_url)
            if len(self.visited) < self.max_pages:
                html = None
                try:
                    resp = self.session.get(url, timeout=10)
                    if resp.status_code == 200:
                        html = resp.text
                except Exception:
                    pass
                if html:
                    new_links = self.extract_links(html, url)
                    self.to_visit.extend(new_links)
                    # Deduplicate while preserving order
                    self.to_visit = list(dict.fromkeys(self.to_visit))

            pbar.update(1)
            # Politeness delay
            time.sleep(self.delay)

        pbar.close()
        print(f"\nCrawl finished: processed {len(self.visited)} pages, data saved to {self.output_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Rocketech site crawler for a RAG dataset")
    parser.add_argument("--start_url", default="<site URL>", help="Start URL")
    parser.add_argument("--output", default="rocketech_data.jsonl", help="Output JSONL file")
    parser.add_argument("--max_pages", type=int, default=500, help="Maximum number of pages to crawl")
    parser.add_argument("--delay", type=float, default=2.0, help="Delay between requests (seconds)")
    args = parser.parse_args()

    crawler = RocketechCrawler(
        start_url=args.start_url,
        output_file=args.output,
        max_pages=args.max_pages,
        delay=args.delay,
    )
    crawler.crawl()
```

Save this as crawl_rocketech.py, then run:

```bash
python crawl_rocketech.py --max_pages 200 --delay 1.5
```
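Each line of the output file is one JSON record with the `url`, `title`, `text`, and `timestamp` fields written by `save_record` above. As a quick sanity check before indexing, here is a minimal sketch that loads the JSONL and splits each page into fixed-size character chunks for downstream RAG use; the `load_records` and `chunk_text` helpers and the chunk/overlap sizes are illustrative choices, not part of the crawler.

```python
import json


def load_records(path="rocketech_data.jsonl"):
    """Read the crawler's JSONL output into a list of dicts."""
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def chunk_text(text, chunk_size=800, overlap=100):
    """Naive fixed-size character chunking with overlap (illustrative only)."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks


records = load_records()
print(f"{len(records)} pages crawled")
for rec in records[:3]:
    pieces = chunk_text(rec["text"])
    print(rec["url"], rec["title"], f"-> {len(pieces)} chunks")
```

Overlapping chunks are a common default for RAG pipelines so that sentences cut at a chunk boundary still appear intact in at least one chunk.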