Plain Text
复制代码99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import requests
# 即使代理失败,也继续请求,但风险自负
# 3. 添加超时和重试逻辑
if 'timeout' not in kwargs:
kwargs['timeout'] = 15 # 稍微增加超时时间,因为代理可能较慢
max_retries = 3
for attempt in range(max_retries):
try:
print(f"第 {attempt + 1} 次尝试请求: {url}")
response = session.request(method, url, **kwargs)
# 检查响应状态和内容
if response.status_code == 200:
if "验证码" not in response.text and "access denied" not in response.text.lower():
print("请求成功!")
return response
else:
print("请求可能被反爬系统拦截,检测到验证码页面")
else:
print(f"请求失败,状态码: {response.status_code}")
except (requests.exceptions.ProxyError,
requests.exceptions.ConnectTimeout,
requests.exceptions.ReadTimeout,
requests.exceptions.ConnectionError) as e:
print(f"第 {attempt + 1} 次请求失败: {e}")
if attempt < max_retries - 1:
wait_time = 2 ** attempt # 指数退避策略
print(f"等待 {wait_time} 秒后重试...")
time.sleep(wait_time)
continue
else:
print("所有重试均失败")
raise e
# 如果不是网络错误,但请求被拦截,也进行重试
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"请求可能被拦截,等待 {wait_time} 秒后重试...")
time.sleep(wait_time)
return None
# 为session绑定新的方法
session.robust_request = _make_request
return session
# 主爬虫函数
def crawl_fanqie_novel(book_id):
    """Fetch the Fanqie novel reader page for *book_id* and return its text.

    A ``ProxyPoolManager`` is created and handed to
    ``create_robust_session``; the page at
    ``https://fanqienovel.com/reader/<book_id>`` is then requested through
    the session's ``robust_request`` method.

    Args:
        book_id: Identifier interpolated into the reader URL.

    Returns:
        ``response.text`` when the response is truthy and its status code
        is 200; otherwise ``None``. ``None`` is also returned when the
        request (or reading the response) raises any ``Exception``; the
        error is printed rather than propagated.

    Note:
        Building the proxy manager and the session happens outside the
        ``try`` block, so failures there are not caught here.
    """
    pool = ProxyPoolManager()
    http = create_robust_session(pool)

    url = f'https://fanqienovel.com/reader/{book_id}'
    print(f"开始爬取: {url}")

    try:
        # Go through the session's custom request method instead of a plain get().
        reply = http.robust_request(url)

        # Guard clause: bail out on a missing/falsy response or a non-200 status.
        if not reply or reply.status_code != 200:
            print("爬取失败,请检查网络或反爬策略。")
            return None

        print("成功获取页面内容!")
        # HTML parsing (e.g. with BeautifulSoup) can be plugged in here
        # before returning; for now the raw page text is handed back.
        return reply.text
    except Exception as exc:
        # Top-level boundary of the crawler: report the problem and signal
        # failure with None instead of raising to the caller.
        print(f"爬虫执行过程中发生错误: {exc}")
        return None
# 使用示例
if __name__ == "__main__":
    # Smoke-test the crawler with a sample book id when run as a script.
    book_id = '123456789012345'
    result = crawl_fanqie_novel(book_id)
    # A truthy result means page text came back; any post-processing of the
    # crawled content would go here.
    print("爬取成功!" if result else "爬取失败!")







待会儿见
K哥馆
mayun
文鼎_应老师
课课家运营团队
liangchsh
启程软考
