GitHub API call

query_org_repos function

Description

This function pulls raw GitHub repository data for multiple organisations and returns a consolidated DataFrame.

Signature

from typing import Dict
import pandas as pd
import logging

# Module-level logger named after this module; query_org_repos writes its
# progress and rate-limit messages to it.
logger = logging.getLogger(__name__)

def query_org_repos(github_org_dict: dict, max_retries: int = 3) -> pd.DataFrame:
    """
    Pull raw GitHub repository data for several organisations into one DataFrame.

    Every organisation is fetched page by page with ``fetch_public_repos`` and
    each page is converted with ``parse_github_repos``. Paging for an
    organisation stops at the first empty page or the first page that holds
    fewer than 100 repositories.

    Args:
        github_org_dict (dict): Organisations to fetch repositories for. Only
            the dictionary *values* are used; each value is passed to
            ``fetch_public_repos`` as the organisation name. Keys are ignored.
        max_retries (int, optional): Maximum number of times to wait and retry
            per organisation after a rate-limit response (HTTP 403 carrying an
            ``X-RateLimit-Reset`` header). Defaults to 3.

    Returns:
        pd.DataFrame: The parsed pages of all organisations concatenated
            row-wise (original row indexes are kept). An empty DataFrame is
            returned when nothing could be fetched.

    Note:
        HTTP errors never propagate to the caller: the affected organisation
        is abandoned (keeping the pages already fetched) and the loop moves on
        to the next one.
    """
    # Page length that signals "there may be another page".
    # NOTE(review): assumes fetch_public_repos requests 100 items per page - confirm.
    page_size = 100

    # Collect pages and concatenate once at the end; concatenating inside the
    # loop re-copies the accumulated frame on every page.
    frames = []

    for org in github_org_dict.values():
        page = 1
        retries = 0
        while True:
            try:
                raw_data = fetch_public_repos(org, page=page)
            except requests.exceptions.HTTPError as e:
                response = e.response
                status_code = getattr(response, "status_code", None)
                reset_header = (
                    response.headers.get("X-RateLimit-Reset")
                    if response is not None
                    else None
                )
                try:
                    reset_time = int(reset_header)
                except (TypeError, ValueError):
                    reset_time = None

                # A 403 without a usable reset timestamp is not something we
                # can wait out, so it is handled like any other HTTP error.
                if status_code != 403 or reset_time is None:
                    logger.error(f"Error fetching data for {org}: {e}")
                    break

                logger.info(f"Rate limit exceeded for organisation {org}.")
                if retries >= max_retries:
                    logger.info(f"Max retries exceeded for organisation {org}. Moving on.")
                    break

                # Clamp at zero: the reset moment may already be in the past
                # and time.sleep() rejects negative durations.
                wait_time = max(reset_time - time.time() + 1, 0)
                logger.info(f"Waiting {wait_time} seconds until rate limit is reset.")
                time.sleep(wait_time)
                retries += 1
                continue  # retry the same page

            repos_count = len(raw_data)
            logger.info(f"{org} repo count = {repos_count}")

            if repos_count == 0:
                break

            frames.append(parse_github_repos(raw_data))

            # A short page means this was the last one.
            if repos_count < page_size:
                break
            page += 1

    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

Parameters

  • github_org_dict: A dictionary mapping organisation display names (keys) to GitHub organisation names (values), e.g. {'Open AI': 'openai'}. Only the values are used when fetching repositories.
  • max_retries: The maximum number of times to retry the API request if a rate limit is encountered. Defaults to 3.

Returns

  • pd.DataFrame: A Pandas DataFrame containing information about repositories for all specified organisations.

Example Usage

from src.ingestion.github_api_call import query_org_repos

# Keys are free-form labels; only the values ('openai', 'google') are used as
# the organisation names when fetching repositories.
df = query_org_repos({'Open AI': 'openai', 'Google': 'google'}, max_retries=3)