From Track to Table: Extracting data

Import Modules


import datetime
import logging
import os

import fastf1
import fastf1.plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from IPython.display import Markdown, display
from google import genai

Set up Logging


# Set up logging to both console and file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("driver_info.log"),
        logging.StreamHandler()
    ]
)


# Suppress fastf1 INFO messages
logging.getLogger("fastf1").setLevel(logging.WARNING)

Set the race year


# Set race_year
race_year = 2025
save_path = '/content/drive/MyDrive/F1 API/'

Create function to get racing schedule

racing_schedulehas a list of all the races in a year, we do not need to extract information for races that have not yet occured. This method takes all the data and not just the delta. If there were significantly more data then the delta method would be used


def get_racing_schedule(race_year: int) -> pd.DataFrame:
  """
  Fetch the racing schedule for a given year, add metadata, and save it to a CSV file.

  Parameters
  ----------
  race_year : int
      The year for which to fetch the racing schedule.

  Returns
  -------
  pd.DataFrame
      A DataFrame containing the racing schedule for the specified year with an additional
      column '_meta_extract_time' indicating when the data was extracted.

  Raises
  ------
  Exception
      If there is an error fetching the schedule from the FastF1 API.

  Side Effects
  ------------
  - Saves the schedule as a CSV file named '{race_year}_racing_schedule.csv'.
  - Logs information about the fetching process, metadata addition, and file saving.
  """

  logging.info(f"Fetching racing schedule for year: {race_year}")

  # Get the event schedule for the season
  try:
      schedule = fastf1.get_event_schedule(race_year)
      logging.info(f"Successfully retrieved {len(schedule)} events for {race_year}")
  except Exception as e:
      logging.error(f"Error fetching schedule for {race_year}: {e}")
      raise

  # Add meta data column
  extract_time = datetime.datetime.now(datetime.UTC)
  schedule['_meta_extract_time'] = extract_time
  logging.info(f"Added metadata column '_meta_extract_time' with value: {extract_time}")


  # Save to storage
  file_name = f'{save_path}{race_year}_racing_schedule.csv'
  schedule.to_csv(file_name, index = False)
  logging.info(f"File saved '{file_name}'")


  return schedule

Getting the data for 2025


racing_schedule = get_racing_schedule(race_year=race_year)

Getting Driver and Team Information

Over the course of a race season the number of teams does not change and drivers of the teams rarely changes. These fields have been treated as static for the qualifiying analysis. If the changes were more frequent then these can be treated as slowly changing dimension (SCD), and versioning could be implemented.


def get_drivers_and_teams(race_year):
    """
    Retrieve the drivers and teams participating in a Formula 1 season for a given year.

    This function uses the `fastf1` package to fetch the official event schedule for the
    specified race year. It then iterates over completed events (up to the current date),
    loads the race session data, and extracts driver and team details. Driver information
    is aggregated across all completed races to account for mid-season changes, while team
    information is extracted once since it is stable throughout the season.

    Args:
        race_year (int): The Formula 1 season year to fetch data for.

    Returns:
        dict: A dictionary containing:
            - "drivers" (pd.DataFrame): DataFrame of unique drivers with fields:
                ['DriverNumber', 'BroadcastName', 'Abbreviation', 'DriverId',
                 'TeamId', 'FirstName', 'LastName', 'FullName', 'HeadshotUrl'].
            - "teams" (pd.DataFrame): DataFrame of unique teams with fields:
                ['TeamId', 'TeamName', 'TeamColor'].

    Notes:
        - Only completed events (EventDate <= current UTC time) are processed.
        - If a driver changes teams mid-season, only the last occurrence is kept.
        - Any failed session loads are skipped with a warning.
        - Requires `fastf1` and `pandas`.

    Raises:
        Exception: If fetching session data fails, the specific event is skipped
                   and the exception is logged.
    """

    # Create empty df for drivers
    drivers = pd.DataFrame()

    # List key fields that need to be extracted - This can be done more dynamically if need on a larger scale using a table-config file
    driver_fields = ['DriverNumber' , 'BroadcastName', 'Abbreviation', 'DriverId', 'TeamId', 'FirstName', 'LastName' ,'FullName', 'HeadshotUrl']
    team_fields = ['TeamId','TeamName', 'TeamColor']

    # Get the racing schedule
    racing_schedule = fastf1.get_event_schedule(race_year)
    logging.info(f"Successfully retrieved {len(racing_schedule)} events for {race_year}")

    # Loop through the races
    for i, row in racing_schedule.iterrows():

      # Only extract historic races - ignoring changes between drivers between sprint and race
      if row['EventDate'] <= datetime.datetime.utcnow():

        logging.info(f"Loading session for RoundNumber {row['RoundNumber']} - OfficialEventName {row['OfficialEventName']}")
        try:
            # Load session data
            session = fastf1.get_session(race_year, row['RoundNumber'], 'R')
            session.load()
            logging.info(f"Session loaded for {row['RoundNumber']} - Event_name {row['OfficialEventName']}")


            # Get Driver information
            logging.info(f"Getting Drivers for {row['RoundNumber']} - Event_name {row['OfficialEventName']}")
            driver_info = session.results[driver_fields]
            drivers = pd.concat([drivers, driver_info], ignore_index=True)



        except Exception as e:
            print(f"Skipping round {row['RoundNumber']} due to error: {e}")

      else:
        logging.info(f"Skipping RoundNumber {row['RoundNumber']} - OfficialEventName {row['OfficialEventName']} - Event has not happened yet")

      print(f'{i} of {len(racing_schedule)}')


    # Driver line up can vary within a season, but a driver is unlikely to change teams mid season.
    # If this does occur with more frequency then an SCD can be used
    logging.info(f"Dropping duplicate drivers")
    before = len(drivers)
    drivers = drivers.drop_duplicates(subset=['DriverNumber'], keep="last").reset_index(drop=True)
    after = len(drivers)
    logging.info(f"Dropped rows : Rows before:{before}, Rows after: {after}, Rows dropped {before - after}")

    # Team information only needs to be extracted once. Teams exist for the whole season so the session object can be used at the end, rather than multiple times in the for loop
    logging.info(f"Getting team information")
    team_info = session.results[team_fields]

    # Each team has two drivers, so dropping dupes
    team_info = team_info.drop_duplicates(keep="last").reset_index(drop=True)

    # Output drivers and team dfs
    output = {
        'drivers': drivers,
        'teams': team_info
    }


    return output
output = get_drivers_and_teams(race_year = race_year)

This data was then saved, to avoid exhausting the API. In this project, the files were saved locally, in a production setting with repeated calls to the API, this data can be stored in simple storage like Azure Blob Storage or S3 buckets, before being brought into a system like a database.

Identifying drivers

Each driver can be identified by a three letter code, this typically easier to identify and communicate. The dataset is small and the data will not be joined extensively, so the driver abbreviation can be used. For larger datasets, with complex joins, a integer join key would be used. A dictionary was created to get the colours for the respective drivers. Each driver has a linestyle key value pair. The solid line is for the "main driver", and dashed for the second driver. When there have been more than two drivers other linestyles have been chosen.


# Create driver dict to display drivers in the respective team colours
# Could have used team code here and pointed to the teams df, but didnt think of that
driver_colour_dict  ={
    'NOR' : {'color': '#F47600', 'linestyle': 'solid'},
    'PIA' : {'color': '#F47600', 'linestyle': 'dashed'},

    'HUL' : {'color': '#01C00E', 'linestyle': 'solid'},
    'BOR' : {'color': '#01C00E', 'linestyle': 'dashed'},

    'HAM' : {'color': '#ED1131', 'linestyle': 'dashed'},
    'LEC' : {'color': '#ED1131', 'linestyle': 'solid'},

    'VER' : {'color': '#4781D7', 'linestyle': 'solid'},
    'TSU' : {'color': '#4781D7', 'linestyle': 'dashed'},

    'GAS' : {'color': '#00A1E8', 'linestyle': 'solid'},
    'DOO' : {'color': '#00A1E8', 'linestyle': 'dotted'},
    'COL' : {'color': '#00A1E8', 'linestyle': 'dashed'},

    'STR' : {'color': '#229971', 'linestyle': 'dashed'},
    'ALO' : {'color': '#229971', 'linestyle': 'solid'},

    'ALB' : {'color': '#1868DB', 'linestyle': 'solid'},
    'SAI' : {'color': '#1868DB', 'linestyle': 'dashed'},

    'RUS' : {'color': '#00D7B6', 'linestyle': 'solid'},
    'ANT' : {'color': '#00D7B6', 'linestyle': 'dashed'},

    'BEA' : {'color': '#9C9FA2', 'linestyle': 'dashed'},
    'OCO' : {'color': '#9C9FA2', 'linestyle': 'solid'},

    'HAD' : {'color': '#6C98FF', 'linestyle': 'solid'},
    'LAW' : {'color': '#6C98FF', 'linestyle': 'dashed'}

}

Home
Back