In today's data-driven era, web crawlers have become an important means of gathering information from the internet. However, many websites, 12306 among them, enforce strict anti-scraping mechanisms, especially around dynamically loaded content. This article walks through how to use Selenium to simulate real browser behavior, work around these restrictions, and successfully scrape 12306 travel product data.
Analyzing 12306's Anti-Scraping Mechanisms
As China Railway's official ticketing platform, 12306 protects its travel product data with multiple layers of defense:
Dynamic content loading: large amounts of data are loaded asynchronously via JavaScript, so traditional crawlers cannot retrieve them (a short demonstration follows this list)
CAPTCHA system: complex image CAPTCHAs and slider CAPTCHAs block automated access
Request header inspection: User-Agent, Referer, and other headers are validated
Behavioral analysis: mouse movement, click patterns, and other human traits are monitored
IP throttling: frequent requests get an IP address temporarily blocked
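To see concretely why the first point defeats traditional crawlers, here is a minimal sketch using the requests library (assumed to be installed; the URL is the one used in the full script below, and product-item is the CSS class the extraction code later relies on). The response contains the initial HTML skeleton but not the JavaScript-rendered product list:

```python
import requests

# A plain HTTP fetch returns only the initial HTML skeleton; the product
# list is filled in later by JavaScript, so it is absent from resp.text.
resp = requests.get(
    "https://kyfw.12306.cn/otn/product/index.html",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
print(resp.status_code)
print("product-item" in resp.text)  # typically False: content not yet rendered
```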
A Brief Introduction to Selenium
Selenium is an automated web testing tool, but its browser automation capabilities make it a powerful weapon against anti-scraping strategies:
Real browser environment: it fully simulates user actions, producing legitimate-looking traffic patterns
JavaScript execution: it can handle dynamically loaded content (see the sketch after this list)
Element interaction: it can simulate clicks, typing, and other user behavior
Cross-platform support: Chrome, Firefox, Edge, and other mainstream browsers are supported
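By contrast with the requests sketch above, a minimal Selenium session executes the page's JavaScript before we read the DOM. This sketch assumes a chromedriver resolvable on PATH; robust driver setup is covered in the next section:

```python
from selenium import webdriver

# Selenium drives a real browser, so JavaScript runs before we read the DOM.
driver = webdriver.Chrome()  # assumes chromedriver on PATH; see setup below
try:
    driver.get("https://kyfw.12306.cn/otn/product/index.html")
    driver.implicitly_wait(10)   # give async content time to render
    html = driver.page_source    # now includes JS-rendered markup
    print(len(html))
finally:
    driver.quit()
```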
Environment Setup and Configuration
Installing the Required Tools and Libraries
First, make sure the following Python libraries are installed:
```bash
pip install selenium beautifulsoup4 pandas webdriver-manager
```
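As a quick sanity check that the installation succeeded, each package can be imported and its version printed:

```python
import selenium, bs4, pandas

# If any import fails, the corresponding pip install did not succeed.
print(selenium.__version__, bs4.__version__, pandas.__version__)
```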
Managing the WebDriver
Selenium needs a driver matching your browser. Using webdriver-manager to download and manage the driver automatically is recommended:
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--start-maximized')  # maximize the window

    # Use webdriver-manager to download and manage ChromeDriver automatically
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Hide WebDriver fingerprints: patch the current (blank) page...
    driver.execute_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    # ...and register a CDP script so every future document gets the same patch
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    return driver
```
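A short usage sketch: if the fingerprint patch took effect, navigator.webdriver should report undefined (which Selenium returns as None):

```python
driver = setup_driver()
try:
    driver.get("https://kyfw.12306.cn")
    # Should print None if the fingerprint patch took effect
    print(driver.execute_script("return navigator.webdriver"))
finally:
    driver.quit()
```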
Crawler Design and Implementation
Page Analysis and Wait Strategy
The 12306 travel product page (https://kyfw.12306.cn/otn/product/index.html) loads its content dynamically, so wait times must be set sensibly:
```python
import random
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_element(driver, by, value, timeout=10):
    """Wait for an element to appear in the DOM."""
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
        return element
    except Exception:
        print(f"Timed out waiting for element: {value}")
        return None

def slow_scroll(driver, scroll_pause_time=0.5):
    """Scroll the page slowly, imitating a human reader."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Randomize the scroll distance to mimic human behavior
        scroll_height = random.randint(300, 800)
        driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
```
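Note that presence_of_element_located only confirms an element exists in the DOM. When a step needs to click or type, expected_conditions offers stricter conditions; a small complementary sketch (the helper name is our own):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_until_clickable(driver, by, value, timeout=10):
    # element_to_be_clickable checks visibility and enabled state,
    # which presence_of_element_located does not.
    return WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((by, value))
    )
```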
Handling CAPTCHA Challenges
When a CAPTCHA appears, we need either manual intervention or an OCR service:
```python
def handle_verification(driver):
    """Handle a CAPTCHA challenge."""
    try:
        # Check whether a CAPTCHA element is present
        captcha_element = wait_for_element(driver, By.ID, "code", timeout=5)
        if captcha_element:
            print("CAPTCHA detected; please solve it manually...")
            # Pause until the user has solved the CAPTCHA
            input("Press Enter after solving the CAPTCHA to continue...")
            return True
    except Exception:
        pass
    return False
```
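For simple text CAPTCHAs, an OCR attempt can run before falling back to manual solving. The sketch below uses pytesseract (which requires the Tesseract binary to be installed) and assumes the same element ID as above; 12306's click-the-image CAPTCHA generally defeats plain OCR, so treat this purely as an illustration:

```python
import io

from PIL import Image
import pytesseract  # requires the Tesseract binary on the system

def try_ocr_captcha(driver, element_id="code"):
    """Best-effort OCR of a text CAPTCHA; returns None on failure."""
    elem = wait_for_element(driver, By.ID, element_id, timeout=5)
    if elem is None:
        return None
    # Screenshot just the CAPTCHA element and run OCR on it
    image = Image.open(io.BytesIO(elem.screenshot_as_png))
    text = pytesseract.image_to_string(image).strip()
    return text or None
```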
Implementing Data Extraction
A complete implementation for extracting travel product information:
```python
from bs4 import BeautifulSoup
import pandas as pd
import re

def extract_tour_products(driver):
    """Extract travel product information from the listing page."""
    # Wait for the product list to load
    wait_for_element(driver, By.CLASS_NAME, "product-list", timeout=15)

    # Scroll slowly so all lazy-loaded content is rendered
    slow_scroll(driver)

    # Grab the page source and parse it with BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    products = []
    product_items = soup.find_all('div', class_='product-item')

    for item in product_items:
        try:
            # Product name
            name_elem = item.find('div', class_='product-name')
            name = name_elem.text.strip() if name_elem else "Unknown product"

            # Price: strip thousands separators, then pull the first number
            price_elem = item.find('div', class_='product-price')
            price_text = price_elem.text.strip() if price_elem else "0"
            match = re.search(r'(\d+)', price_text.replace(',', ''))
            price = int(match.group(1)) if match else 0

            # Origin/destination route
            route_elem = item.find('div', class_='product-route')
            route = route_elem.text.strip() if route_elem else ""

            # Link to the product detail page
            link_elem = item.find('a', href=True)
            link = "https://kyfw.12306.cn" + link_elem['href'] if link_elem else ""

            # Product image
            img_elem = item.find('img', src=True)
            img_url = img_elem['src'] if img_elem else ""

            products.append({
                'name': name,
                'price': price,
                'route': route,
                'link': link,
                'image_url': img_url,
                'crawled_time': pd.Timestamp.now()
            })
        except Exception as e:
            print(f"Error extracting product info: {e}")
            continue

    return products
```
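The price-parsing step is easy to verify in isolation; the sample string below is made up for illustration:

```python
import re

price_text = "¥1,299 per person"  # illustrative sample, not real site text
# Strip thousands separators first so "1,299" matches as one number
match = re.search(r'(\d+)', price_text.replace(',', ''))
print(int(match.group(1)) if match else 0)  # -> 1299
```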
The Complete Crawler Pipeline
A complete implementation that ties the pieces above together:
```python
import random
import json
import re
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Proxy credentials
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"


def get_proxy_url():
    """Build an authenticated proxy URL."""
    return f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"


def setup_driver():
    """Set up a browser instance behind the proxy."""
    chrome_options = Options()

    # Configure the proxy server (with credentials).
    # Note: Chrome ignores credentials embedded in --proxy-server, so an
    # authenticated proxy usually needs a helper extension or a tool such
    # as selenium-wire.
    proxy_url = get_proxy_url()
    chrome_options.add_argument(f'--proxy-server={proxy_url}')

    # Other options
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--start-maximized')

    # Use webdriver-manager to download and manage ChromeDriver automatically
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Hide WebDriver fingerprints: patch the current (blank) page, then
    # register a CDP script so every future document gets the same patch
    driver.execute_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    return driver


def wait_for_element(driver, by, value, timeout=10):
    """Wait for an element to appear in the DOM."""
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
        return element
    except Exception:
        print(f"Timed out waiting for element: {value}")
        return None


def slow_scroll(driver, scroll_pause_time=0.5):
    """Scroll the page slowly, imitating a human reader."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Randomize the scroll distance to mimic human behavior
        scroll_height = random.randint(300, 800)
        driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def handle_verification(driver):
    """Handle a CAPTCHA challenge."""
    try:
        # Check whether a CAPTCHA element is present
        captcha_element = wait_for_element(driver, By.ID, "code", timeout=5)
        if captcha_element:
            print("CAPTCHA detected; please solve it manually...")
            # Pause until the user has solved the CAPTCHA
            input("Press Enter after solving the CAPTCHA to continue...")
            return True
    except Exception:
        pass
    return False


def extract_tour_products(driver):
    """Extract travel product information from the listing page."""
    # Wait for the product list to load
    wait_for_element(driver, By.CLASS_NAME, "product-list", timeout=15)

    # Scroll slowly so all lazy-loaded content is rendered
    slow_scroll(driver)

    # Grab the page source and parse it with BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    products = []
    product_items = soup.find_all('div', class_='product-item')

    for item in product_items:
        try:
            # Product name
            name_elem = item.find('div', class_='product-name')
            name = name_elem.text.strip() if name_elem else "Unknown product"

            # Price: strip thousands separators, then pull the first number
            price_elem = item.find('div', class_='product-price')
            price_text = price_elem.text.strip() if price_elem else "0"
            match = re.search(r'(\d+)', price_text.replace(',', ''))
            price = int(match.group(1)) if match else 0

            # Origin/destination route
            route_elem = item.find('div', class_='product-route')
            route = route_elem.text.strip() if route_elem else ""

            # Link to the product detail page
            link_elem = item.find('a', href=True)
            link = "https://kyfw.12306.cn" + link_elem['href'] if link_elem else ""

            # Product image
            img_elem = item.find('img', src=True)
            img_url = img_elem['src'] if img_elem else ""

            products.append({
                'name': name,
                'price': price,
                'route': route,
                'link': link,
                'image_url': img_url,
                'crawled_time': pd.Timestamp.now()
            })
        except Exception as e:
            print(f"Error extracting product info: {e}")
            continue

    return products


def crawl_12306_tours():
    """End-to-end flow for crawling 12306 travel products."""
    print("Starting the browser (via proxy server)...")
    print(f"Proxy server: {proxyHost}:{proxyPort}")
    driver = setup_driver()

    try:
        # Open the 12306 travel product page
        print("Opening the 12306 travel product page...")
        driver.get("https://kyfw.12306.cn/otn/product/index.html")

        # Give the page time to load
        time.sleep(3)

        # Check for and handle a CAPTCHA
        if handle_verification(driver):
            # Wait for the page to reload after the CAPTCHA is solved
            time.sleep(3)

        # Extract product information
        print("Extracting travel product information...")
        products = extract_tour_products(driver)

        # Save the data
        if products:
            df = pd.DataFrame(products)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"12306_tours_{timestamp}.csv"
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Extracted {len(products)} travel products; saved to {filename}")

            # Also save as JSON; default=str serializes the pandas Timestamps
            json_filename = f"12306_tours_{timestamp}.json"
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(products, f, ensure_ascii=False, indent=2, default=str)
            print(f"JSON data saved to {json_filename}")
        else:
            print("No travel products found")

        return products

    except Exception as e:
        print(f"Error during crawling: {e}")
        return []
    finally:
        # Close the browser
        driver.quit()
        print("Browser closed")


# Run the crawler
if __name__ == "__main__":
    crawl_12306_tours()
```
Ethical and Legal Considerations
When using Selenium to scrape 12306 data, the following ethical and legal issues must be kept in mind:
Respect robots.txt: check the target site's crawler policy (a helper sketch follows this list)
Throttle request frequency: avoid putting undue load on the server
Restrict data use: use the data only for personal study and research
Protect user privacy: never collect, store, or share any personal user information
Respect copyright: honor the copyright notices on content from the 12306 website
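As a practical starting point for the first two items, Python's standard library can consult robots.txt, and a jittered delay keeps the request rate polite. A small sketch (whether 12306 serves a robots.txt for this path is not verified here):

```python
import random
import time
from urllib.robotparser import RobotFileParser

# Check whether crawling a given path is allowed by the site's robots.txt
rp = RobotFileParser()
rp.set_url("https://kyfw.12306.cn/robots.txt")
rp.read()
print(rp.can_fetch("*", "/otn/product/index.html"))

def polite_sleep(base=3.0, jitter=2.0):
    """Sleep a randomized interval between requests to limit server load."""
    time.sleep(base + random.uniform(0, jitter))
```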