Source code for reporoulette

"""RepoRoulette: Randomly Sample GitHub Repositories.

A Python library for randomly sampling GitHub repositories using multiple methods:
- ID-based sampling: Probes random repository IDs
- Temporal sampling: Weighted sampling based on repository activity by time period
- BigQuery sampling: Advanced querying using Google BigQuery's GitHub dataset
- GitHub Archive sampling: Event-based sampling from GitHub Archive files

Example:
    >>> from reporoulette import sample
    >>> results = sample(method='temporal', n_samples=10)
    >>> print(f"Found {len(results['samples'])} repositories")
"""

import importlib.metadata
import logging
import os
from typing import Any

from .samplers.bigquery_sampler import BigQuerySampler
from .samplers.gh_sampler import GHArchiveSampler
from .samplers.id_sampler import IDSampler
from .samplers.temporal_sampler import TemporalSampler

try:
    __version__ = importlib.metadata.version("reporoulette")
except importlib.metadata.PackageNotFoundError:
    # Package is not installed, running from source
    __version__ = "dev"

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)


[docs] def sample( method: str = "temporal", n_samples: int = 50, token: str | None = None, **kwargs: Any, ) -> dict[str, Any]: """Sample repositories using the specified method. Args: method: Sampling method ('id', 'temporal', 'archive', or 'bigquery') n_samples: Number of repositories to sample token: GitHub Personal Access Token (not used for BigQuery) **kwargs: Additional parameters specific to each sampler Returns: Dictionary with sampling results and stats Raises: ValueError: If an unknown sampling method is provided """ # Use environment token if none provided if token is None: token = os.environ.get("GITHUB_TOKEN") # Create the appropriate sampler if method.lower() == "id": sampler = IDSampler(token=token) elif method.lower() == "temporal": sampler = TemporalSampler(token=token) elif method.lower() == "archive": sampler = GHArchiveSampler() elif method.lower() == "bigquery": credentials_path = kwargs.pop( "credentials_path", os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") ) project_id = kwargs.pop("project_id", None) sampler = BigQuerySampler( credentials_path=credentials_path, project_id=project_id ) else: error_msg = f"Unknown sampling method: {method}" logging.error(error_msg) return {"error": error_msg} # Sample repositories results = sampler.sample(n_samples=n_samples, **kwargs) # Return results and stats return { "method": method, "params": kwargs, "attempts": sampler.attempts, "success_rate": sampler.success_rate, "samples": results, }
# Export samplers __all__ = [ "IDSampler", "TemporalSampler", "BigQuerySampler", "GHArchiveSampler", "sample", ]