GitHub API call
query_org_repos
function
Description
This function pulls raw GitHub repository data for multiple organisations and returns a consolidated DataFrame.
Signature
from typing import Dict
import pandas as pd
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def query_org_repos(github_org_dict: dict, max_retries: int = 3) -> pd.DataFrame:
    """
    Pull raw GitHub repository data for multiple organisations and return a consolidated DataFrame.

    Each organisation is fetched page by page via ``fetch_public_repos`` and every
    non-empty page is parsed with ``parse_github_repos``. Paging for an organisation
    stops on an empty page, on a page with fewer than 100 entries, on a non-403
    HTTP error, or once the rate-limit retry budget is used up.

    Args:
        github_org_dict (dict): A dictionary containing GitHub organisations to fetch repositories for.
            Only the values are used; they should be organisation names.
        max_retries (int, optional): The maximum number of times to retry the API request
            (per organisation) if a rate limit is encountered. Defaults to 3.

    Returns:
        pd.DataFrame: A Pandas DataFrame containing information about repositories for all
            specified organisations. Empty if nothing was fetched.
    """
    # NOTE(review): assumes fetch_public_repos asks for 100 repos per page, so a
    # shorter page means it was the last one -- confirm against that helper.
    page_size = 100

    # Collect the parsed pages and concatenate once at the end instead of
    # re-copying the growing DataFrame on every page.
    frames = []

    for org in github_org_dict.values():
        page = 1
        retries = 0
        while True:
            try:
                raw_data = fetch_public_repos(org, page=page)
                repos_count = len(raw_data)
                logger.info(f"{org} repo count = {repos_count}")
                if repos_count == 0:
                    break
                frames.append(parse_github_repos(raw_data))
                # check if there are more pages
                if repos_count < page_size:
                    break
                page += 1
            except requests.exceptions.HTTPError as e:
                response = e.response
                # An HTTPError can be raised without a response attached.
                status_code = getattr(response, "status_code", None)
                if status_code != 403:
                    logger.error(f"Error fetching data for {org}: {e}")
                    break

                # A 403 is treated as a rate-limit response here.
                logger.info(f"Rate limit exceeded for organisation {org}.")
                if retries >= max_retries:
                    logger.info(f"Max retries exceeded for organisation {org}. Moving on.")
                    break

                try:
                    reset_time = int(response.headers.get("X-RateLimit-Reset"))
                except (TypeError, ValueError):
                    # Header missing or not numeric: there is no reset time to wait for.
                    logger.warning(
                        f"No usable X-RateLimit-Reset header for organisation {org}. Moving on."
                    )
                    break

                # Clamp at zero: the reset time may already have passed, and
                # time.sleep() rejects negative durations.
                wait_time = max(0, reset_time - time.time() + 1)
                logger.info(f"Waiting {wait_time} seconds until rate limit is reset.")
                time.sleep(wait_time)
                retries += 1

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)
Parameters
- github_org_dict: A dictionary containing GitHub organisations to fetch repositories for. Keys are organisation display names and values are the organisations' GitHub names; only the values are used to query the API.
- max_retries: The maximum number of times to retry the API request if a rate limit is encountered. Defaults to 3.
Returns
- pd.DataFrame: A Pandas DataFrame containing information about repositories for all specified organisations.
Example Usage
from src.ingestion.github_api_call import query_org_repos
df = query_org_repos({'Open AI': 'openai', 'Google': 'google'}, max_retries=3)