// LICENSE_CODE ZON
'use strict'; /*jslint node:true es9:true*/

| 4 | +const web_scraping_strategy = { |
| 5 | + name: 'web_scraping_strategy', |
| 6 | + description: 'Decision tree for picking the right Bright Data tool. ' |
| 7 | + +'Invoke at the start of any scraping session to learn the correct ' |
| 8 | + +'tool selection order ' |
| 9 | + +'(dataset tools -> Web Unlocker -> Browser API).', |
| 10 | + arguments: [], |
| 11 | + load: ()=>'You have access to Bright Data tools at three tiers of cost' |
| 12 | + +' and capability.\n' |
| 13 | + +'Always follow this order -- do not skip ahead:\n' |
| 14 | + +'\nSTEP 1 -- Check for a dedicated dataset tool (fastest, cheapest):' |
| 15 | + +'\n Look at the URL. If it matches a known platform, use the' |
| 16 | + +' corresponding web_data_* tool:' |
| 17 | + +'\n - Amazon product page (/dp/) -> web_data_amazon_product' |
| 18 | + +'\n - Amazon search results' |
| 19 | + +' -> web_data_amazon_product_search' |
| 20 | + +'\n - LinkedIn profile' |
| 21 | + +' -> web_data_linkedin_person_profile' |
| 22 | + +'\n - LinkedIn company' |
| 23 | + +' -> web_data_linkedin_company_profile' |
| 24 | + +'\n - Instagram profile/post/reel' |
| 25 | + +' -> web_data_instagram_profiles / _posts / _reels' |
| 26 | + +'\n - TikTok profile/post' |
| 27 | + +' -> web_data_tiktok_profiles / _posts' |
| 28 | + +'\n - YouTube video/channel' |
| 29 | + +' -> web_data_youtube_videos / _profiles' |
| 30 | + +'\n - Reddit post -> web_data_reddit_posts' |
| 31 | + +'\n - X (Twitter) post -> web_data_x_posts' |
| 32 | + +'\n - Zillow listing' |
| 33 | + +' -> web_data_zillow_properties_listing' |
| 34 | + +'\n - Booking.com hotel' |
| 35 | + +' -> web_data_booking_hotel_listings' |
| 36 | + +'\n - GitHub file' |
| 37 | + +' -> web_data_github_repository_file' |
| 38 | + +'\n - Google Maps reviews -> web_data_google_maps_reviews' |
| 39 | + +'\n - Google Shopping -> web_data_google_shopping' |
| 40 | + +'\n - (and more -- check all web_data_* tools before proceeding)' |
| 41 | + +'\n\nSTEP 2 -- If no dataset tool matches, use scrape_as_markdown' |
| 42 | + +' (default):' |
| 43 | + +'\n This handles anti-bot protection and CAPTCHA automatically.' |
| 44 | + +'\n Retry once if the first attempt returns empty or blocked' |
| 45 | + +' content.' |
| 46 | + +'\n\nSTEP 3 -- If scrape_as_markdown fails twice, escalate to' |
| 47 | + +' scraping_browser_navigate:' |
| 48 | + +'\n Use ONLY when the page requires JavaScript execution,' |
| 49 | + +' user interaction' |
| 50 | + +'\n (clicking, form submission), or dynamic content loading.' |
| 51 | + +'\n This is slower and more expensive' |
| 52 | + +' -- do not use as a first attempt.' |
| 53 | + +'\n\nNEVER use the browser tools for sites' |
| 54 | + +' scrape_as_markdown can handle.' |
| 55 | + +'\nNEVER use scrape_as_markdown when a web_data_* tool matches' |
| 56 | + +' the URL pattern.', |
| 57 | +}; |
| 59 | +const diagnose_scraping_approach = { |
| 60 | + name: 'diagnose_scraping_approach', |
| 61 | + description: 'Run a two-step diagnostic to discover the correct ' |
| 62 | + +'Bright Data product for a new website. Tries Web Unlocker first, ' |
| 63 | + +'then Browser API, then reports which succeeded.', |
| 64 | + arguments: [], |
| 65 | + load: ()=>'To discover the correct Bright Data product for a new' |
| 66 | + +' website, run this diagnostic:\n' |
| 67 | + +'\n1. Try scrape_as_markdown on the target URL.' |
| 68 | + +'\n - If it returns useful content' |
| 69 | + +' -> Web Unlocker is the correct integration. Stop.' |
| 70 | + +'\n - If it returns empty, blocked, or low-quality content' |
| 71 | + +' -> continue to step 2.' |
| 72 | + +'\n\n2. Try scraping_browser_navigate + scraping_browser_snapshot' |
| 73 | + +' on the same URL.' |
| 74 | + +'\n - If it returns useful content' |
| 75 | + +' -> Browser API is the correct integration. Stop.' |
| 76 | + +'\n - If both fail -> report to the user that the target may' |
| 77 | + +' require a specialized' |
| 78 | + +'\n Bright Data product' |
| 79 | + +' (SERP API, specific dataset tool, or custom configuration).' |
| 80 | + +'\n\nReport which approach succeeded and recommend it as the' |
| 81 | + +' integration method.' |
| 82 | + +'\nDo not proceed with data extraction until the diagnostic' |
| 83 | + +' is complete.', |
| 84 | +}; |
| 86 | +const prompts = [web_scraping_strategy, diagnose_scraping_approach]; |
| 87 | + |
| 88 | +export default prompts; |