Skip to content

codablellm

codablellm is a framework for creating and curating high-quality code datasets tailored for large language models

DecompileConfig dataclass

Configuration for decompiling binaries.

Source code in src/codablellm/core/decompiler.py
@dataclass(frozen=True)
class DecompileConfig:
    """
    Configuration for decompiling binaries.

    Raises:
        ValueError: If `max_workers` is provided and is less than 1.
    """

    max_workers: Optional[int] = None
    """
    Maximum number of binaries to decompile in parallel.
    """
    decompiler_args: Sequence[Any] = field(default_factory=list)
    """
    Positional arguments to pass to the decompiler's `__init__` method.
    """
    decompiler_kwargs: Mapping[str, Any] = field(default_factory=dict)
    """
    Keyword arguments to pass to the decompiler's `__init__` method.
    """
    symbol_remover: Optional[SymbolRemovalStrategy] = None
    """
    Optional strategy used to remove symbols from decompiled functions.
    """
    recursive: bool = False
    """
    If True, recursively scan directories for binaries to decompile.
    """
    strict: bool = False
    """
    If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.
    """

    def __post_init__(self) -> None:
        """
        Validate `max_workers` and, when it is set, export it to the environment.

        Raises:
            ValueError: If `max_workers` is not `None` and is less than 1.
        """
        # Compare against None instead of relying on truthiness: 0 is falsy, so a
        # truthiness test would let it slip past the positivity check below.
        if self.max_workers is None:
            return
        if self.max_workers < 1:
            raise ValueError("Max workers must be a positive integer")
        # Stored as a string under CODABLELLM_MAX_WORKERS_ENVIRON_KEY in os.environ.
        os.environ[CODABLELLM_MAX_WORKERS_ENVIRON_KEY] = str(self.max_workers)

decompiler_args = field(default_factory=list) class-attribute instance-attribute

Positional arguments to pass to the decompiler's __init__ method.

decompiler_kwargs = field(default_factory=dict) class-attribute instance-attribute

Keyword arguments to pass to the decompiler's __init__ method.

max_workers = None class-attribute instance-attribute

Maximum number of binaries to decompile in parallel.

recursive = False class-attribute instance-attribute

If True, recursively scan directories for binaries to decompile.

strict = False class-attribute instance-attribute

If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.

symbol_remover = None class-attribute instance-attribute

Optional strategy used to remove symbols from decompiled functions.

DecompiledCodeDatasetConfig dataclass

Configuration options for generating a decompiled dataset.

This class defines the settings for extracting source code functions from binaries and configuring the decompilation process.

Source code in src/codablellm/dataset.py
@dataclass(frozen=True)
class DecompiledCodeDatasetConfig:
    """
    Settings used when generating a decompiled dataset.

    Bundles the source-function extraction settings, the binary decompilation
    settings, and the mapper that pairs decompiled functions with source functions.
    """

    extract_config: extractor.ExtractConfig = field(
        default_factory=extractor.ExtractConfig
    )
    """
    Settings that control how source code functions are extracted.
    """
    decompiler_config: decompiler.DecompileConfig = field(
        default_factory=decompiler.DecompileConfig
    )
    """
    Settings that control how binaries are decompiled.
    """
    mapper: utils.DynamicSymbol = DEFAULT_MAPPER
    """
    Symbol of the mapping function that decides whether a decompiled function
    corresponds to a given source function.
    """

    def get_mapper(self) -> Mapper:
        """
        Resolve `mapper` through `utils.dynamic_import` and return the result.
        """
        resolved_mapper = utils.dynamic_import(self.mapper)
        return resolved_mapper

decompiler_config = field(default_factory=(decompiler.DecompileConfig)) class-attribute instance-attribute

Configuration settings for decompiling binaries.

extract_config = field(default_factory=(extractor.ExtractConfig)) class-attribute instance-attribute

Configuration settings for extracting source code functions.

mapper = DEFAULT_MAPPER class-attribute instance-attribute

The mapping function used to determine if a decompiled function corresponds to a given source function.

ExtractConfig dataclass

Configuration for extracting source code functions.

Source code in src/codablellm/core/extractor.py
@dataclass(frozen=True)
class ExtractConfig:
    """
    Configuration for extracting source code functions.

    Raises:
        ValueError: If `max_workers` is provided and is less than 1, if
            `exclude_subpaths` and `exclusive_subpaths` share any path, or if
            `checkpoint` is negative.
    """

    max_workers: Optional[int] = None
    """
    Maximum number of files to extract functions from in parallel.
    """
    accurate_progress: bool = True
    """
    Whether to accurately track progress by counting extractable files in advance. This may take
    longer to start but provides more accurate progress tracking.
    """
    transform: Optional[DynamicSymbol] = None
    """
    An optional transformation to apply to each source code function.
    """
    exclusive_subpaths: Set[Path] = field(default_factory=set)
    """
    A set of subpaths to exclusively extract functions from. If specified, only these subpaths will be extracted.
    """
    exclude_subpaths: Set[Path] = field(default_factory=set)
    """
    A set of subpaths to exclude from extraction. If specified, these subpaths will be ignored.
    """
    checkpoint: int = 10
    """
    The number of steps between saving checkpoints. Set to 0 to disable checkpoints.
    """
    use_checkpoint: bool = True
    """
    `True` if a checkpoint file should be loaded and used to resume extraction.
    """
    extract_as_repo: bool = True
    """
    `True` if the path should be treated as a repository root for calculating relative function scopes.
    """
    extractor_args: Dict[str, Sequence[Any]] = field(default_factory=dict)
    """
    Positional arguments to pass to the extractor's `__init__` method. The keys are language
    names. The values are sequences of arguments. For example, `{'C': [arg1, arg2]}`.
    """
    extractor_kwargs: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    """
    Keyword arguments to pass to the extractor's `__init__` method. The keys are language names.
    The values are dictionaries of keyword arguments. For example, `{'C': {'kwarg1': value1}}`.
    """
    # NOTE(review): this field is undocumented and unused within this class; presumably
    # it mirrors DecompileConfig.strict (raise on failures vs. log and continue) - confirm.
    strict: bool = False

    def __post_init__(self) -> None:
        """
        Validate the configuration values.

        Raises:
            ValueError: If `max_workers` is not `None` and is less than 1, if the two
                subpath sets intersect, or if `checkpoint` is negative.
        """
        # Compare against None instead of relying on truthiness: 0 is falsy, so a
        # truthiness test would let it slip past the positivity check.
        if self.max_workers is not None and self.max_workers < 1:
            raise ValueError("Max workers must be a positive integer")
        # A path cannot be both exclusively included and excluded.
        if self.exclude_subpaths & self.exclusive_subpaths:
            raise ValueError(
                "Cannot have overlapping paths in exclude_subpaths and "
                "exclusive_subpaths"
            )
        if self.checkpoint < 0:
            raise ValueError("Checkpoint must be a non-negative integer")

    def get_transform(self) -> Optional[Transform]:
        """
        Resolve `transform` through `dynamic_import`.

        Returns:
            The result of `dynamic_import(self.transform)`, or `None` when no
            transform is configured.
        """
        if self.transform:
            return dynamic_import(self.transform)
        return None

accurate_progress = True class-attribute instance-attribute

Whether to accurately track progress by counting extractable files in advance. This may take longer to start but provides more accurate progress tracking.

checkpoint = 10 class-attribute instance-attribute

The number of steps between saving checkpoints. Set to 0 to disable checkpoints.

exclude_subpaths = field(default_factory=set) class-attribute instance-attribute

A set of subpaths to exclude from extraction. If specified, these subpaths will be ignored.

exclusive_subpaths = field(default_factory=set) class-attribute instance-attribute

A set of subpaths to exclusively extract functions from. If specified, only these subpaths will be extracted.

extract_as_repo = True class-attribute instance-attribute

True if the path should be treated as a repository root for calculating relative function scopes.

extractor_args = field(default_factory=dict) class-attribute instance-attribute

Positional arguments to pass to the extractor's __init__ method. The keys are language names. The values are sequences of arguments. For example, {'C': [arg1, arg2]}.

extractor_kwargs = field(default_factory=dict) class-attribute instance-attribute

Keyword arguments to pass to the extractor's __init__ method. The keys are language names. The values are dictionaries of keyword arguments. For example, {'C': {'kwarg1': value1}}.

max_workers = None class-attribute instance-attribute

Maximum number of files to extract functions from in parallel.

transform = None class-attribute instance-attribute

An optional transformation to apply to each source code function.

use_checkpoint = True class-attribute instance-attribute

True if a checkpoint file should be loaded and used to resume extraction.

ManageConfig dataclass

Configuration settings for managing a built local repository.

Source code in src/codablellm/repoman.py
@dataclass(frozen=True)
class ManageConfig:
    """
    Configuration settings for managing a built local repository.
    """

    cleanup_command: Optional[utils.Command] = None
    """
    An optional CLI command to clean up the build artifacts of the repository.
    """
    build_error_handling: utils.CommandErrorHandler = "interactive"
    """
    Specifies how to handle errors during the build process.
    """
    cleanup_error_handling: utils.CommandErrorHandler = "ignore"
    """
    Specifies how to handle errors during the cleanup process, if `cleanup_command` is provided.
    """
    show_progress: Optional[bool] = None
    """
    Indicates whether to display a progress bar during both the build and cleanup processes.
    """
    run_from: Literal["cwd", "repo"] = "repo"
    """
    Specifies the working directory from which to run build and clean commands.

    - `repo`: Use the root of the repository as the working directory. This may refer to the original
    repository path or a duplicated temporary copy depending on the generation mode.
    - `cwd`: Use the current working directory at the time the command is run.

    This option controls how relative paths within commands are resolved and can affect the behavior
    of tools that assume a specific project root.
    """
    # NOTE(review): this field is undocumented and its use is not visible in this
    # class; confirm what these additional paths are for and document it.
    extra_paths: Sequence[utils.PathLike] = field(default_factory=list)

build_error_handling = 'interactive' class-attribute instance-attribute

Specifies how to handle errors during the build process.

cleanup_command = None class-attribute instance-attribute

An optional CLI command to clean up the build artifacts of the repository.

cleanup_error_handling = 'ignore' class-attribute instance-attribute

Specifies how to handle errors during the cleanup process, if cleanup_command is provided.

run_from = 'repo' class-attribute instance-attribute

Specifies the working directory from which to run build and clean commands.

  • repo: Use the root of the repository as the working directory. This may refer to the original repository path or a duplicated temporary copy depending on the generation mode.
  • cwd: Use the current working directory at the time the command is run.

This option controls how relative paths within commands are resolved and can affect the behavior of tools that assume a specific project root.

show_progress = None class-attribute instance-attribute

Indicates whether to display a progress bar during both the build and cleanup processes.

SourceCodeDatasetConfig dataclass

Configuration options for generating a source code dataset.

This class provides flexible options for controlling how a source code dataset is generated, including handling of temporary directories, extraction settings, and generation modes.

Source code in src/codablellm/dataset.py
@dataclass
class SourceCodeDatasetConfig:
    """
    Settings used when generating a source code dataset.

    Covers the generation mode, whether the temporary directory is removed
    afterwards, and the settings used to extract source code functions.
    """

    generation_mode: DatasetGenerationMode = "temp"
    """
    The mode in which the source code dataset is generated.
    """
    delete_temp: bool = True
    """
    Whether the temporary directory is deleted once the dataset has been generated.

    - *Only relevant when `generation_mode` is `temp`. If `True`, the temporary
    directory is removed automatically after dataset generation.*
    """
    extract_config: extractor.ExtractConfig = field(
        default_factory=extractor.ExtractConfig
    )
    """
    Settings that control how source code functions are extracted.
    """
    log_generation_warning: bool = True
    """
    If `True`, log a warning when `__post_init__` switches `generation_mode` to `path`.
    """

    def __post_init__(self) -> None:
        """
        Fall back to the `path` generation mode when a temp-based mode is requested
        but `extract_config` has no transform.
        """
        wants_temp_copy = self.generation_mode in ("temp", "temp-append")
        if not wants_temp_copy or self.extract_config.transform:
            return
        if self.log_generation_warning:
            logger.warning(
                f'Generation mode was specified as "{self.generation_mode}", but no '
                'transform was provided. Changing generation mode to "path" to '
                "save resources"
            )
        self.generation_mode = "path"

delete_temp = True class-attribute instance-attribute

Controls whether the temporary directory should be deleted after dataset generation.

  • Applies only if generation_mode is set to temp. When set to True, the temporary directory will be automatically deleted after dataset generation.

extract_config = field(default_factory=(extractor.ExtractConfig)) class-attribute instance-attribute

Configuration settings for extracting source code functions.

generation_mode = 'temp' class-attribute instance-attribute

How the source code dataset should be generated.