Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions src/sofifa_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@


class SoFIFAScraper:
def __init__(self, player_urls_file="player_urls.csv", output_file="player_stats.csv"):
def __init__(self, player_urls_file="player_urls.csv", output_file="player_stats.csv", skipped_player_file="player_skipped.csv"):
self.player_urls_file = player_urls_file
self.output_file = output_file
self.skipped_player_file = skipped_player_file
self.player_urls = []
self.player_stats = []
self.columns = None
Expand Down Expand Up @@ -61,6 +62,10 @@ async def scrape_player_stats(self, max_players=None):
# Block images, stylesheets, fonts to optimize loading
await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "stylesheet", "font", "media"] else route.continue_())

# Initialize skipped player file with header
with open(self.skipped_player_file, 'w', newline='', encoding='utf-8') as f:
csv.writer(f).writerow(['player_url'])

urls_to_scrape = self.player_urls[:max_players] if max_players else self.player_urls
total = len(urls_to_scrape)

Expand Down Expand Up @@ -106,9 +111,15 @@ async def scrape_player_stats(self, max_players=None):

if not success:
print(f" ✗ Failed after {max_retries} retries, skipping...")
self.save_skipped_player_to_csv(url)

await browser.close()

def save_skipped_player_to_csv(self, url):
    """Append one skipped player's URL as a single-column row to the skipped-players CSV.

    The file is opened in append mode so repeated calls accumulate rows
    beneath the header written when scraping started.
    """
    with open(self.skipped_player_file, 'a', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow([url])

def _get_column_order(self, stats_dict):
"""Define and return the column order for CSV"""
priority_cols = [
Expand Down Expand Up @@ -189,6 +200,11 @@ def parse_args():
default="player_stats.csv",
help="Path to the CSV file for saving player stats"
)
parser.add_argument(
"--skipped-player-file",
default="player_skipped.csv",
help="Path to the CSV file for saving skipped player"
)
return parser.parse_args()


Expand All @@ -197,7 +213,8 @@ async def main():
args = parse_args()
scraper = SoFIFAScraper(
player_urls_file=args.player_urls_file,
output_file=args.output_file
output_file=args.output_file,
skipped_player_file=args.skipped_player_file,
)

# Load player URLs from CSV file
Expand All @@ -224,10 +241,15 @@ async def main():
print("\n" + "="*60)
print("SCRAPING COMPLETED!")
print("="*60)
skipped_count = len(scraper.player_urls) - len(scraper.player_stats)
if args.max_players:
skipped_count = args.max_players - len(scraper.player_stats)
print(f"Total player URLs loaded: {len(scraper.player_urls)}")
print(f"Total player stats scraped: {len(scraper.player_stats)}")
print("\nFile created:")
print(" - player_stats.csv (detailed stats for all players)")
print(f"Total player skipped: {skipped_count}")
print("\nFiles created:")
print(f" - {args.output_file} (detailed stats for all players)")
print(f" - {args.skipped_player_file} (skipped player URLs)")

# Show sample of stats columns
if scraper.player_stats:
Expand Down