From Track to Table: Extracting data
Import Modules
import datetime
import logging
import os
import fastf1
import fastf1.plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from google import genai
Set up Logging
# Set up logging to both console and file
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("driver_info.log"),
logging.StreamHandler()
]
)
# Suppress fastf1 INFO messages
logging.getLogger("fastf1").setLevel(logging.WARNING)
Set the race year
# Set race_year
race_year = 2025
save_path = '/content/drive/MyDrive/F1 API/'
Create function to get racing schedule
racing_schedulehas a list of all the races in a year, we do not need to extract information for races
that have not yet occured. This method takes all the data and not just the delta.
If there were significantly more data then the delta method would be used
def get_racing_schedule(race_year: int) -> pd.DataFrame:
"""
Fetch the racing schedule for a given year, add metadata, and save it to a CSV file.
Parameters
----------
race_year : int
The year for which to fetch the racing schedule.
Returns
-------
pd.DataFrame
A DataFrame containing the racing schedule for the specified year with an additional
column '_meta_extract_time' indicating when the data was extracted.
Raises
------
Exception
If there is an error fetching the schedule from the FastF1 API.
Side Effects
------------
- Saves the schedule as a CSV file named '{race_year}_racing_schedule.csv'.
- Logs information about the fetching process, metadata addition, and file saving.
"""
logging.info(f"Fetching racing schedule for year: {race_year}")
# Get the event schedule for the season
try:
schedule = fastf1.get_event_schedule(race_year)
logging.info(f"Successfully retrieved {len(schedule)} events for {race_year}")
except Exception as e:
logging.error(f"Error fetching schedule for {race_year}: {e}")
raise
# Add meta data column
extract_time = datetime.datetime.now(datetime.UTC)
schedule['_meta_extract_time'] = extract_time
logging.info(f"Added metadata column '_meta_extract_time' with value: {extract_time}")
# Save to storage
file_name = f'{save_path}{race_year}_racing_schedule.csv'
schedule.to_csv(file_name, index = False)
logging.info(f"File saved '{file_name}'")
return schedule
Getting the data for 2025
racing_schedule = get_racing_schedule(race_year=race_year)
Getting Driver and Team Information
Over the course of a race season the number of teams does not change and drivers of the teams rarely changes. These fields have been treated as static for the qualifiying analysis. If the changes were more frequent then these can be treated as slowly changing dimension (SCD), and versioning could be implemented.
def get_drivers_and_teams(race_year):
"""
Retrieve the drivers and teams participating in a Formula 1 season for a given year.
This function uses the `fastf1` package to fetch the official event schedule for the
specified race year. It then iterates over completed events (up to the current date),
loads the race session data, and extracts driver and team details. Driver information
is aggregated across all completed races to account for mid-season changes, while team
information is extracted once since it is stable throughout the season.
Args:
race_year (int): The Formula 1 season year to fetch data for.
Returns:
dict: A dictionary containing:
- "drivers" (pd.DataFrame): DataFrame of unique drivers with fields:
['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId',
'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl'].
- "teams" (pd.DataFrame): DataFrame of unique teams with fields:
['TeamId', 'TeamName', 'TeamColor'].
Notes:
- Only completed events (EventDate <= current UTC time) are processed.
- If a driver changes teams mid-season, only the last occurrence is kept.
- Any failed session loads are skipped with a warning.
- Requires `fastf1` and `pandas`.
Raises:
Exception: If fetching session data fails, the specific event is skipped
and the exception is logged.
"""
# Create empty df for drivers
drivers = pd.DataFrame()
# List key fields that need to be extracted - This can be done more dynamically if need on a larger scale using a table-config file
driver_fields = ['DriverNumber' , 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamId', 'FirstName', 'LastName' ,'FullName', 'HeadshotUrl']
team_fields = ['TeamId','TeamName', 'TeamColor']
# Get the racing schedule
racing_schedule = fastf1.get_event_schedule(race_year)
logging.info(f"Successfully retrieved {len(racing_schedule)} events for {race_year}")
# Loop through the races
for i, row in racing_schedule.iterrows():
# Only extract historic races - ignoring changes between drivers between sprint and race
if row['EventDate'] <= datetime.datetime.utcnow():
logging.info(f"Loading session for RoundNumber {row['RoundNumber']} - OfficialEventName {row['OfficialEventName']}")
try:
# Load session data
session = fastf1.get_session(race_year, row['RoundNumber'], 'R')
session.load()
logging.info(f"Session loaded for {row['RoundNumber']} - Event_name {row['OfficialEventName']}")
# Get Driver information
logging.info(f"Getting Drivers for {row['RoundNumber']} - Event_name {row['OfficialEventName']}")
driver_info = session.results[driver_fields]
drivers = pd.concat([drivers, driver_info], ignore_index=True)
except Exception as e:
print(f"Skipping round {row['RoundNumber']} due to error: {e}")
else:
logging.info(f"Skipping RoundNumber {row['RoundNumber']} - OfficialEventName {row['OfficialEventName']} - Event has not happened yet")
print(f'{i} of {len(racing_schedule)}')
# Driver line up can vary within a season, but a driver is unlikely to change teams mid season.
# If this does occur with more frequency then an SCD can be used
logging.info(f"Dropping duplicate drivers")
before = len(drivers)
drivers = drivers.drop_duplicates(subset=['DriverNumber'], keep="last").reset_index(drop=True)
after = len(drivers)
logging.info(f"Dropped rows : Rows before:{before}, Rows after: {after}, Rows dropped {before - after}")
# Team information only needs to be extracted once. Teams exist for the whole season so the session object can be used at the end, rather than multiple times in the for loop
logging.info(f"Getting team information")
team_info = session.results[team_fields]
# Each team has two drivers, so dropping dupes
team_info = team_info.drop_duplicates(keep="last").reset_index(drop=True)
# Output drivers and team dfs
output = {
'drivers': drivers,
'teams': team_info
}
return output
output = get_drivers_and_teams(race_year = race_year)
This data was then saved, to avoid exhausting the API. In this project, the files were saved locally, in a production setting with repeated calls to the API, this data can be stored in simple storage like Azure Blob Storage or S3 buckets, before being brought into a system like a database.
Identifying drivers
Each driver can be identified by a three letter code, this typically easier to
identify and communicate. The dataset is small and the data will not be joined
extensively, so the driver abbreviation can be used. For larger datasets, with complex
joins, a integer join key would be used. A dictionary was created to get the colours for the
respective drivers. Each driver has a linestyle key value pair. The solid
line is for the "main driver", and dashed for the second driver. When there have been more
than two drivers other linestyles have been chosen.
# Create driver dict to display drivers in the respective team colours
# Could have used team code here and pointed to the teams df, but didnt think of that
driver_colour_dict ={
'NOR' : {'color': '#F47600', 'linestyle': 'solid'},
'PIA' : {'color': '#F47600', 'linestyle': 'dashed'},
'HUL' : {'color': '#01C00E', 'linestyle': 'solid'},
'BOR' : {'color': '#01C00E', 'linestyle': 'dashed'},
'HAM' : {'color': '#ED1131', 'linestyle': 'dashed'},
'LEC' : {'color': '#ED1131', 'linestyle': 'solid'},
'VER' : {'color': '#4781D7', 'linestyle': 'solid'},
'TSU' : {'color': '#4781D7', 'linestyle': 'dashed'},
'GAS' : {'color': '#00A1E8', 'linestyle': 'solid'},
'DOO' : {'color': '#00A1E8', 'linestyle': 'dotted'},
'COL' : {'color': '#00A1E8', 'linestyle': 'dashed'},
'STR' : {'color': '#229971', 'linestyle': 'dashed'},
'ALO' : {'color': '#229971', 'linestyle': 'solid'},
'ALB' : {'color': '#1868DB', 'linestyle': 'solid'},
'SAI' : {'color': '#1868DB', 'linestyle': 'dashed'},
'RUS' : {'color': '#00D7B6', 'linestyle': 'solid'},
'ANT' : {'color': '#00D7B6', 'linestyle': 'dashed'},
'BEA' : {'color': '#9C9FA2', 'linestyle': 'dashed'},
'OCO' : {'color': '#9C9FA2', 'linestyle': 'solid'},
'HAD' : {'color': '#6C98FF', 'linestyle': 'solid'},
'LAW' : {'color': '#6C98FF', 'linestyle': 'dashed'}
}