AI-powered web scraping framework for extracting structured data from websites. Use when Codex needs to crawl, scrape, or extract data from web pages using AI-powered parsing, handle dynamic content, or work with complex HTML structures.
Crawl4ai 是一个基于人工智能的网络爬取框架,旨在高效地从网站中提取结构化数据。它将传统的 HTML 解析与人工智能相结合,以处理动态内容、智能提取文本,并从复杂的网页中清理和结构化数据。
当 Codex 需要以下操作时使用:
触发短语:
python
from crawl4ai import AsyncWebCrawler, BrowserMode
async def scrape_page(url):
    """Crawl a single page and return its Markdown and cleaned HTML.

    Args:
        url: Address of the page to crawl.

    Returns:
        A ``(markdown, clean_html)`` tuple taken from the crawl result.
    """
    # NOTE(review): the keyword arguments below are passed through exactly as
    # the original snippet spelled them; confirm they match the installed
    # crawl4ai version.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            browser_mode=BrowserMode.LATEST,
            headless=True,
        )
        return result.markdown, result.clean_html
python
from crawl4ai import AsyncWebCrawler, JsonModeScreener
import json
async def extract_products(url):
    """Crawl ``url`` and collect the entries tagged as products.

    Args:
        url: Address of the page to crawl.

    Returns:
        A list of ``{"name", "price", "url"}`` dicts, one per extracted item
        whose ``"type"`` equals ``"product"``. Empty when nothing was
        extracted.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            javascript=True,
            bypass_cache=True,
        )
        # Assumes extracted_content is an iterable of dicts -- TODO confirm
        # against the crawl4ai result type. Fall back to an empty list so a
        # page with no extracted content yields [] instead of raising.
        items = result.extracted_content or []
        return [
            {
                "name": item["name"],
                "price": item["price"],
                "url": item["url"],
            }
            for item in items
            if item["type"] == "product"
        ]
场景: 用户想要爬取网站上的所有文章标题。
python
from crawl4ai import AsyncWebCrawler
async def scrape_articles(url):
    """Crawl ``url`` and return the titles of the extracted articles.

    Args:
        url: Address of the page to crawl.

    Returns:
        A list with one title per extracted item: the item's ``"name"`` if
        present, otherwise its ``"text"``, otherwise an empty string.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            javascript=True,
            verbose=True,
        )
        # Treat missing/empty extracted content as "no articles".
        # Assumes each item is a dict -- TODO confirm against the result type.
        articles = result.extracted_content or []
        return [
            item.get("name", item.get("text", ""))
            for item in articles
        ]
触发: “爬取此网站的文章标题” 或 “从 [URL] 获取所有标题”
场景: 网站通过 JavaScript 加载数据。
python
from crawl4ai import AsyncWebCrawler
async def scrapedynamicsite(url):
    """Crawl a JavaScript-rendered page and return its Markdown.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``markdown`` attribute of the crawl result.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            javascript=True,  # wait for JS execution
            wait_for="body",  # wait for a specific element
            delay=1.5,  # extra wait after the page has loaded
            headless=True,
        )
        return result.markdown
触发: “爬取此动态网站” 或 “此页面需要 JavaScript 加载数据”
场景: 提取特定字段,如价格、描述等。
python
from crawl4ai import AsyncWebCrawler
async def extractproductdetails(url):
    """Crawl ``url`` and run an in-page script that collects product fields.

    The injected JavaScript selects every ``.product`` element and maps it to
    an object holding the text of its ``.name`` and ``.price`` children and
    the ``href`` of its first link.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``extracted_content`` attribute of the crawl result.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            # The script is a string literal; the CSS selectors inside it
            # must be quoted for the JavaScript to be valid.
            js_code="""
                const products = document.querySelectorAll('.product');
                return Array.from(products).map(p => ({
                    name: p.querySelector('.name')?.textContent,
                    price: p.querySelector('.price')?.textContent,
                    url: p.querySelector('a')?.href
                }));
            """,
        )
        return result.extracted_content
触发: “从该页面提取产品详情” 或 “从 [URL] 获取价格和名称”
场景: 清理杂乱的 HTML 并提取干净的文本。
python
from crawl4ai import AsyncWebCrawler
async def cleanandparse(url):
    """Crawl ``url`` with boilerplate tags removed and return the cleaned HTML.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``clean_html`` attribute of the crawl result. Note that this is
        cleaned HTML, not Markdown.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            remove_tags=["script", "style", "nav", "footer", "header"],
            # NOTE(review): spelled "onlymaincontent" in the extracted text;
            # restored on the assumption that underscores were stripped --
            # confirm the option name against the crawl4ai documentation.
            only_main_content=True,
        )
        # Return the attribute directly: the original assigned "cleantext"
        # but returned "clean_text", which would raise NameError.
        return result.clean_html
触发: “清理此 HTML” 或 “从该页面提取主要内容”
python
async def customscrape(url, customjs):
    """Crawl ``url`` while running caller-supplied JavaScript in the page.

    Args:
        url: Address of the page to crawl.
        customjs: JavaScript source passed to the crawler as ``js_code``.

    Returns:
        The ``extracted_content`` attribute of the crawl result.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            js_code=customjs,
            js_only=True,  # only execute the JS; do not download resources
        )
        return result.extracted_content
python
from crawl4ai import AsyncWebCrawler
async def multipagescrape(base_url, urls):
    """Crawl several pages sequentially with one shared crawler instance.

    Args:
        base_url: Not used by this function; kept so existing callers that
            pass it keep working.
        urls: Iterable of page addresses to crawl, in order.

    Returns:
        A list with one dict per URL, holding ``"url"``, ``"content"`` (the
        result's Markdown) and ``"status"`` (the result's ``success`` flag).
    """
    async with AsyncWebCrawler() as crawler:
        results = []
        for url in urls:
            result = await crawler.arun(
                url=url,
                session_id=f"session_{url}",  # one session name per URL
                bypass_cache=True,
            )
            results.append(
                {
                    "url": url,
                    "content": result.markdown,
                    "status": result.success,
                }
            )
        return results
python
async def robust_scrape(url):
    """Crawl ``url`` and report failures instead of raising.

    Args:
        url: Address of the page to crawl.

    Returns:
        ``(markdown, extracted_content)`` when the crawl result reports
        success; ``(None, None)`` when it reports failure or when any
        exception is raised. Failures are printed to standard output.
    """
    # Deliberately broad: this helper is a best-effort boundary that turns
    # every crawl failure into a (None, None) result.
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                timeout=30000,  # 30-second timeout (presumably milliseconds)
            )
            if result.success:
                return result.markdown, result.extracted_content
            print(f"爬取失败:{result.error_message}")
            return None, None
    except Exception as e:
        print(f"爬取错误:{e}")
        return None, None
Crawl4ai 支持多种输出格式:
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 crawl4ai-1776340222 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 crawl4ai-1776340222 技能
skillhub install crawl4ai-1776340222
文件大小: 16.36 KB | 发布时间: 2026-4-17 14:07