爬取动态电商网站数据。使用Playwright处理JavaScript渲染的页面,支持绕过Cloudflare反爬、隐藏API发现、分页抓取。适用于: (1) 爬取京东/淘宝/拼多多等中国电商, (2) 爬取Amazon/eBay等国际电商, (3) 价格监控和竞品分析, (4) 批量商品数据采集。
电商动态网站爬虫技能,基于Playwright处理JavaScript渲染。
python
from playwright.sync_api import sync_playwright


def scrape_page(url):
    """Return the rendered HTML of ``url`` after its JavaScript has run.

    Launches headless Chromium, navigates to ``url``, waits for the
    ``networkidle`` load state and returns the serialized page HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The page's HTML as a string.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # The load state is passed as a string literal.
            page.goto(url, wait_until="networkidle")
            return page.content()
        finally:
            # Close the browser even when navigation raises.
            browser.close()
python
import json
import re
from concurrent.futures import ThreadPoolExecutor

from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import sync_playwright
def _first_text(element, selector):
    """Return the stripped inner text of the first ``selector`` match, or None."""
    node = element.query_selector(selector)
    return node.inner_text().strip() if node else None


def _first_attribute(element, selector, attribute):
    """Return ``attribute`` of the first ``selector`` match, or None."""
    node = element.query_selector(selector)
    return node.get_attribute(attribute) if node else None


def scrapeecommerceproducts(url, max_pages=3):
    """Scrape product data from a paginated e-commerce listing.

    Visits ``url`` with a ``page=N`` query parameter for N = 1..max_pages
    and collects one dict per product element that has a title. Stops
    early when a page shows no "下一页" (next page) button or link.

    Args:
        url: Listing URL; may already contain a query string.
        max_pages: Maximum number of listing pages to visit.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``link`` and
        ``image``. A value is None when the matching element is absent.
    """
    products = []
    if max_pages < 1:
        # Nothing to fetch, so do not start a browser at all.
        return products

    # Append the page parameter correctly when url already has a query.
    separator = "&" if "?" in url else "?"

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36"
                ),
            )
            page = context.new_page()
            # Hide navigator.webdriver before any page script runs.
            page.add_init_script(
                """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                """
            )

            for page_num in range(1, max_pages + 1):
                print(f"爬取第 {page_num} 页...")
                page.goto(
                    f"{url}{separator}page={page_num}",
                    wait_until="networkidle",
                    timeout=30000,
                )

                # Wait for products to load.
                try:
                    page.wait_for_selector(
                        ".product-item, .goods-item, [class*=product]",
                        timeout=10000,
                    )
                except PlaywrightError:
                    # Best effort: if nothing shows up in time, the
                    # extraction below simply finds zero items.
                    pass

                # Substring matches (*=) so this finds the same kind of
                # elements the wait selector above targets.
                items = page.query_selector_all(
                    "div[class*=product], li[class*=item], .goods-item"
                )
                for item in items:
                    try:
                        product = {
                            "title": _first_text(
                                item, "a[class*=title], h3, .product-title"
                            ),
                            "price": _first_text(
                                item, "[class*=price], .sale-price, .real-price"
                            ),
                            "link": _first_attribute(item, "a", "href"),
                            "image": _first_attribute(item, "img", "src"),
                        }
                        if product["title"]:
                            products.append(product)
                    except Exception as e:
                        # One bad item must not abort the whole page.
                        print(f"提取错误: {e}")

                # Stop when there is no next-page control.
                next_btn = page.query_selector(
                    'button:has-text("下一页"), a:has-text("下一页")'
                )
                if not next_btn:
                    break
        finally:
            # Close the browser even when navigation raises.
            browser.close()
    return products
不要直接爬页面,先找API:
python
def findhiddenapi(url):
    """Collect the URLs of API-like responses seen while loading ``url``.

    Loads the page in headless Chromium and records every response whose
    URL contains ``api`` or ``json`` (case-insensitive).

    Args:
        url: Address of the page to inspect.

    Returns:
        The recorded response URLs that start with ``http``, in the order
        they were received. Duplicates are kept.
    """
    api_requests = []

    def record_if_api(response):
        # Lower-case once and test both markers against it.
        lowered = response.url.lower()
        if "api" in lowered or "json" in lowered:
            api_requests.append(response.url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # Register the listener before navigating so no response is missed.
            page.on("response", record_if_api)
            page.goto(url, wait_until="networkidle")
        finally:
            # Close the browser even when navigation raises.
            browser.close()
    return [r for r in api_requests if r.startswith("http")]
找API技巧:
python
def bypass_cloudflare(url):
    """Load ``url`` in a headed browser with automation hints masked.

    Starts Chromium with ``headless=False``, overrides
    ``navigator.webdriver``, ``navigator.plugins`` and
    ``navigator.languages`` before any page script runs, then navigates
    to ``url`` and waits up to 15 seconds for a ``body`` element.

    Args:
        url: Address of the protected page.

    Returns:
        The page HTML as a string, whether or not the wait succeeded.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=False,  # a headed browser passes the check more easily
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
            ],
        )
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
            )
            page = context.new_page()
            # Inject a script that hides automation fingerprints.
            page.add_init_script(
                """
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3]});
                Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en']});
                """
            )
            page.goto(url)

            # Wait for the Cloudflare verification to finish.
            # NOTE(review): a challenge page presumably has a <body> too, so
            # this may report success too early - confirm a better selector.
            try:
                page.wait_for_selector("body", timeout=15000)
                print("✅ Cloudflare bypassed!")
            except PlaywrightError:
                print("⚠️ 可能需要手动验证")

            return page.content()
        finally:
            # Close the browser even when navigation raises.
            browser.close()
python
def scrapewithpagination(baseurl, maxpages=10):
    """Walk a paginated listing page by page.

    Visits ``baseurl`` with a ``page=N`` query parameter for
    N = 1..maxpages. Stops early when a page fails to load or shows no
    "下一页" (next page) button or link.

    Args:
        baseurl: Listing URL; may already contain a query string.
        maxpages: Maximum number of pages to visit.

    Returns:
        The list of collected products. The extraction step is still a
        placeholder, so the list is currently always empty.
    """
    all_products = []
    if maxpages < 1:
        # Nothing to fetch, so do not start a browser at all.
        return all_products

    separator = "&" if "?" in baseurl else "?"

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            # One page is reused for every listing page, so tabs do not
            # pile up the way a fresh page per iteration would.
            page = browser.new_page()
            for page_num in range(1, maxpages + 1):
                url = f"{baseurl}{separator}page={page_num}"
                print(f"爬取第 {page_num}/{maxpages} 页: {url}")
                try:
                    page.goto(url, wait_until="networkidle", timeout=30000)
                except Exception as e:
                    print(f"页面加载失败: {e}")
                    break

                # Extract data from `page` here and append it to
                # all_products. This runs before the last-page check so
                # the final page is not skipped.

                # Check whether this is the last page.
                next_btn = page.query_selector(
                    'button:has-text("下一页"), a:has-text("下一页")'
                )
                if not next_btn:
                    print("没有更多页面了")
                    break
        finally:
            # Close the browser even when something above raises.
            browser.close()
    return all_products
python
通用电商爬虫脚本 (基础版):
bash
python3 scripts/scrape.py scrape --url https://example.com/products --max-pages 5 --output products.json
支持登录的增强版 (推荐):
bash
支持平台: jd (京东), taobao (淘宝), pdd (拼多多)
隐藏API发现脚本:
bash
python3 scripts/api_discovery.py https://example.com
Cloudflare绕过脚本:
bash
python3 scripts/cloudflare_bypass.py https://example.com --output page.html
def scrape_concurrently(urls, max_workers=5):
    """Fetch several pages in parallel with ``scrape_page``.

    Args:
        urls: Iterable of page addresses.
        max_workers: Size of the thread pool; defaults to 5.

    Returns:
        A list with the result of ``scrape_page`` for each URL, in the
        same order as ``urls``.
    """
    urls = list(urls)
    if not urls:
        # Nothing to fetch, so do not spin up a thread pool.
        return []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Materialize inside the with-block so the results, and any
        # exception from a worker, surface before the pool shuts down.
        return list(executor.map(scrape_page, urls))
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 ecommerce-scraper-1776298952 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 ecommerce-scraper-1776298952 技能
skillhub install ecommerce-scraper-1776298952
文件大小: 13.3 KB | 发布时间: 2026-4-16 18:30