-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathcrawl_with_formats.py
More file actions
45 lines (39 loc) · 1.21 KB
/
crawl_with_formats.py
File metadata and controls
45 lines (39 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from dotenv import load_dotenv
load_dotenv()
import time
from scrapegraph_py import (
ScrapeGraphAI,
CrawlRequest,
MarkdownFormatConfig,
LinksFormatConfig,
)
# SDK client, constructed with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment populated
# by load_dotenv() above — confirm against the SDK documentation.
sgai = ScrapeGraphAI()

# Crawl request: start URL, page/depth limits, and the two requested formats.
crawl_request = CrawlRequest(
    url="https://scrapegraphai.com/",
    max_pages=3,
    max_depth=1,
    formats=[MarkdownFormatConfig(), LinksFormatConfig()],
)

# Submit the request; the result is inspected by the code that follows.
start_res = sgai.crawl.start(crawl_request)
# Check the start result, then poll the crawl and print a per-page report once
# it reaches a terminal status.
# NOTE(review): the nesting below is reconstructed from syntax; the final
# report is assumed to live inside the polling loop — confirm against the
# original file.
if start_res.status == "success" and start_res.data:
    crawl_id = start_res.data.id
    print("Crawl started:", crawl_id)

    status = start_res.data.status
    # Keep polling only while the reported status is "running"; any other
    # value ends the loop.
    while status == "running":
        time.sleep(2)  # pause between status requests
        get_res = sgai.crawl.get(crawl_id)
        if not (get_res.status == "success" and get_res.data):
            print("Failed to get status:", get_res.error)
            break

        snapshot = get_res.data
        status = snapshot.status
        print(f"Progress: {snapshot.finished}/{snapshot.total} - {status}")

        # On a terminal status, list every page from this same response; the
        # loop condition then fails because status is no longer "running".
        if status == "completed" or status == "failed":
            print("\nPages crawled:")
            for page in snapshot.pages:
                print(f"\n Page: {page.url}")
                print(f" Status: {page.status}")
                print(f" Depth: {page.depth}")
else:
    print("Failed to start:", start_res.error)