Skip to content

core

Core functionality of codablellm.

DecompileConfig dataclass

Configuration for decompiling binaries.

Source code in src/codablellm/core/decompiler.py
@dataclass(frozen=True)
class DecompileConfig:
    """
    Configuration for decompiling binaries.

    Raises:
        ValueError: If `max_workers` is provided and is less than 1.
    """

    max_workers: Optional[int] = None
    """
    Maximum number of binaries to decompile in parallel.
    """
    decompiler_args: Sequence[Any] = field(default_factory=list)
    """
    Positional arguments to pass to the decompiler's `__init__` method.
    """
    decompiler_kwargs: Mapping[str, Any] = field(default_factory=dict)
    """
    Keyword arguments to pass to the decompiler's `__init__` method.
    """
    symbol_remover: Optional[SymbolRemovalStrategy] = None
    """
    Optional strategy used to remove symbols from decompiled functions.
    """
    recursive: bool = False
    """
    If True, recursively scan directories for binaries to decompile.
    """
    strict: bool = False
    """
    If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.
    """

    def __post_init__(self) -> None:
        """
        Validates `max_workers` and, when it is set, publishes it through the
        `CODABLELLM_MAX_WORKERS_ENVIRON_KEY` environment variable.

        Raises:
            ValueError: If `max_workers` is set and is not a positive integer.
        """
        # Compare against None instead of relying on truthiness: 0 is falsy and
        # would otherwise slip past the "positive integer" validation.
        if self.max_workers is None:
            return
        if self.max_workers < 1:
            raise ValueError("Max workers must be a positive integer")
        # Side effect: the worker limit is exported process-wide via os.environ.
        os.environ[CODABLELLM_MAX_WORKERS_ENVIRON_KEY] = str(self.max_workers)

decompiler_args = field(default_factory=list) class-attribute instance-attribute

Positional arguments to pass to the decompiler's __init__ method.

decompiler_kwargs = field(default_factory=dict) class-attribute instance-attribute

Keyword arguments to pass to the decompiler's __init__ method.

max_workers = None class-attribute instance-attribute

Maximum number of binaries to decompile in parallel.

recursive = False class-attribute instance-attribute

If True, recursively scan directories for binaries to decompile.

strict = False class-attribute instance-attribute

If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.

symbol_remover = None class-attribute instance-attribute

Optional strategy used to remove symbols from decompiled functions.

DecompiledFunction dataclass

Bases: Function

A decompiled function extracted from a compiled binary file.

Source code in src/codablellm/core/function.py
@dataclass(frozen=True)
class DecompiledFunction(Function):
    """
    A decompiled function extracted from a compiled binary file.
    """

    assembly: str
    """
    Assembly code of the function.
    """
    architecture: str
    """
    The architecture of the binary file from which the function was decompiled.
    """
    address: int
    """
    The starting address of the function in the binary file.
    """

    @deprecated(
        reason="Use DecompileConfig.symbol_remover when creating datasets",
        version="1.2.0",
    )
    def to_stripped(self) -> "DecompiledFunction":
        """
        Creates a stripped version of the decompiled function with anonymized symbol names.

        This method replaces all function symbols in both the function definition and assembly code
        with generated placeholders (e.g., `sub_<uuid>`), ensuring sensitive or original identifiers
        are removed. The resulting `DecompiledFunction` has an updated definition, stripped function name,
        and modified assembly code.

        Returns:
            A new `DecompiledFunction` instance with stripped symbols and updated assembly.

        Raises:
            ValueError: If a matched symbol node has no text, or if no function symbol was
                matched in the definition (so no stripped name can be chosen).
        """
        assembly = self.assembly
        # Maps each original symbol to its placeholder so repeated occurrences of the
        # same symbol are replaced consistently.
        symbol_mapping: Dict[str, str] = {}

        def strip(node: Node) -> str:
            # Only `assembly` is rebound here; `symbol_mapping` is mutated in place.
            nonlocal assembly
            if not node.text:
                raise ValueError(
                    "Expected all function.symbols to have " f"text: {node}"
                )
            orig_function = node.text.decode()
            stripped_symbol = symbol_mapping.setdefault(
                orig_function, f'sub_{str(uuid.uuid4()).split("-", maxsplit=1)[0]}'
            )
            assembly = assembly.replace(orig_function, stripped_symbol)
            return stripped_symbol

        editor = ASTEditor(C_PARSER, self.definition)
        logger.info(f"Stripping {self.name}...")
        editor.match_and_edit(GET_C_SYMBOLS_QUERY, {"function.symbols": strip})
        definition = editor.source_code
        stripped_names = [f for f in symbol_mapping.values() if f.startswith("sub_")]
        if not stripped_names:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(
                f"No function symbols were found to strip in {self.name}"
            )
        # Keyword arguments keep `name` and `definition` in their proper fields; the
        # field order inherited from `Function` is (uid, path, name, definition).
        # NOTE(review): metadata is not carried over to the stripped copy, matching
        # the previous behavior - confirm whether that is intended.
        return DecompiledFunction(
            uid=self.uid,
            path=self.path,
            name=stripped_names[0],
            definition=definition,
            assembly=assembly,
            architecture=self.architecture,
            address=self.address,
        )

    def to_json(self) -> DecompiledFunctionJSONObject:
        """
        Serializes the function to a JSON-compatible mapping.

        Returns:
            The base `Function` JSON fields plus `assembly`, `architecture` and `address`.
        """
        function_json = super().to_json()
        return {
            "assembly": self.assembly,
            "architecture": self.architecture,
            "address": self.address,
            **function_json,
        }

    @staticmethod
    def create_uid(
        file_path: Path, name: str, _repo_path: Optional[Path] = None
    ) -> str:
        """
        Creates a UID for a function based on its file path and name.

        Parameters:
            file_path: The full file path of the function definition.
            name: The name of the function.
            _repo_path: Accepted but not used when building the UID.

        Returns:
            A UID string in the format: `<file_path>::<function_name>`.
        """
        return f"{file_path}::{name}"

    @classmethod
    def from_json(cls, json_obj: DecompiledFunctionJSONObject) -> "DecompiledFunction":
        """
        Reconstructs a decompiled function from the mapping produced by `to_json`.

        Parameters:
            json_obj: Mapping holding `uid`, `path`, `name`, `definition`, `assembly`,
                `architecture`, `address` and `metadata`.

        Returns:
            A new instance populated from `json_obj`.
        """
        return cls(
            json_obj["uid"],
            Path(json_obj["path"]),
            json_obj["name"],
            json_obj["definition"],
            json_obj["assembly"],
            json_obj["architecture"],
            json_obj["address"],
            _metadata=json_obj["metadata"],
        )

    @no_type_check
    @classmethod
    def from_decompiled_json(cls, json_obj: JSONObject) -> "DecompiledFunction":
        """
        Builds a decompiled function from a JSON object that carries no UID or metadata.

        The UID is derived from the `path` and `name` entries via `create_uid`.

        Parameters:
            json_obj: Mapping holding `path`, `name`, `definition`, `assembly`,
                `architecture` and `address`.

        Returns:
            A new instance populated from `json_obj`.
        """
        return cls(
            DecompiledFunction.create_uid(Path(json_obj["path"]), json_obj["name"]),
            Path(json_obj["path"]),
            json_obj["name"],
            json_obj["definition"],
            json_obj["assembly"],
            json_obj["architecture"],
            json_obj["address"],
        )

address instance-attribute

The starting address of the function in the binary file.

architecture instance-attribute

The architecture of the binary file from which the function was decompiled.

assembly instance-attribute

Assembly code of the function.

create_uid(file_path, name, _repo_path=None) staticmethod

Creates a UID for a function based on its file path and name.

Parameters:

Name Type Description Default
file_path Path

The full file path of the function definition.

required
name str

The name of the function.

required

Returns:

Type Description
str

A UID string in the format: <file_path>::<function_name>.

Source code in src/codablellm/core/function.py
@staticmethod
def create_uid(
    file_path: Path, name: str, _repo_path: Optional[Path] = None
) -> str:
    """
    Creates a UID for a function based on its file path and name.

    Parameters:
        file_path: The full file path of the function definition.
        name: The name of the function.

    Returns:
        A UID string in the format: `<file_path>::<function_name>`.
    """
    return f"{file_path}::{name}"

to_stripped()

Creates a stripped version of the decompiled function with anonymized symbol names.

This method replaces all function symbols in both the function definition and assembly code with generated placeholders (e.g., sub_<uuid>), ensuring sensitive or original identifiers are removed. The resulting DecompiledFunction has an updated definition, stripped function name, and modified assembly code.

Returns:

Type Description
DecompiledFunction

A new DecompiledFunction instance with stripped symbols and updated assembly.

Source code in src/codablellm/core/function.py
@deprecated(
    reason="Use DecompileConfig.symbol_remover when creating datasets",
    version="1.2.0",
)
def to_stripped(self) -> "DecompiledFunction":
    """
    Creates a stripped version of the decompiled function with anonymized symbol names.

    This method replaces all function symbols in both the function definition and assembly code
    with generated placeholders (e.g., `sub_<uuid>`), ensuring sensitive or original identifiers
    are removed. The resulting `DecompiledFunction` has an updated definition, stripped function name,
    and modified assembly code.

    Returns:
        A new `DecompiledFunction` instance with stripped symbols and updated assembly.

    Raises:
        ValueError: If a matched symbol node has no text, or if no function symbol was
            matched in the definition (so no stripped name can be chosen).
    """
    assembly = self.assembly
    # Maps each original symbol to its placeholder so repeated occurrences of the
    # same symbol are replaced consistently.
    symbol_mapping: Dict[str, str] = {}

    def strip(node: Node) -> str:
        # Only `assembly` is rebound here; `symbol_mapping` is mutated in place.
        nonlocal assembly
        if not node.text:
            raise ValueError(
                "Expected all function.symbols to have " f"text: {node}"
            )
        orig_function = node.text.decode()
        stripped_symbol = symbol_mapping.setdefault(
            orig_function, f'sub_{str(uuid.uuid4()).split("-", maxsplit=1)[0]}'
        )
        assembly = assembly.replace(orig_function, stripped_symbol)
        return stripped_symbol

    editor = ASTEditor(C_PARSER, self.definition)
    logger.info(f"Stripping {self.name}...")
    editor.match_and_edit(GET_C_SYMBOLS_QUERY, {"function.symbols": strip})
    definition = editor.source_code
    stripped_names = [f for f in symbol_mapping.values() if f.startswith("sub_")]
    if not stripped_names:
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(f"No function symbols were found to strip in {self.name}")
    # Keyword arguments keep `name` and `definition` in their proper fields; passing
    # them positionally in (definition, name) order swapped the two values.
    # NOTE(review): metadata is not carried over to the stripped copy, matching
    # the previous behavior - confirm whether that is intended.
    return DecompiledFunction(
        uid=self.uid,
        path=self.path,
        name=stripped_names[0],
        definition=definition,
        assembly=assembly,
        architecture=self.architecture,
        address=self.address,
    )

Decompiler

Bases: ABC

Abstract base class for a decompiler that extracts decompiled functions from compiled binaries.

Source code in src/codablellm/core/decompiler.py
class Decompiler(ABC):
    """
    Interface for decompilers that turn a compiled binary into `DecompiledFunction` objects.
    """

    @abstractmethod
    def decompile(self, path: PathLike) -> Sequence[DecompiledFunction]:
        """
        Decompiles the binary at `path` and returns every function found in it.

        Parameters:
            path: The path to the binary file to be decompiled.

        Returns:
            A sequence of `DecompiledFunction` objects, one per function extracted from the binary.
        """

    @abstractmethod
    def get_stripped_function_name(self, address: int) -> str:
        """
        Gives the anonymized name used for the function located at `address`.

        Parameters:
            address: The memory address of the function.

        Returns:
            A stripped-down or anonymized function name (e.g., `FUN_<addr>`).
        """

    def decompile_stripped(
        self, path: PathLike, strategy: "SymbolRemovalStrategy"
    ) -> Sequence[DecompiledFunction]:
        """
        Decompiles a binary and applies a symbol removal strategy.

        With the "strip" strategy the binary is decompiled, the `strip` CLI tool is run
        directly on `path` (so the file on disk is modified), and the binary is decompiled
        again. Functions are paired by address and the stripped definition and assembly are
        stored in the original function's metadata under `stripped_definition` and
        `stripped_assembly`. Any other strategy value is handled by `pseudo_strip`.

        Parameters:
            path: Path to the binary to decompile.
            strategy: Strategy for symbol removal. Options include "strip" (using the `strip` CLI tool)
                    or "pseudo-strip" (AST-based anonymization of symbols).

        Returns:
            A sequence of `DecompiledFunction` objects with symbol-stripped metadata.
        """
        logger.info(f"Stripping {repr(path)}...")

        # Everything that is not the real `strip` tool goes through pseudo-stripping.
        if strategy != "strip":
            logger.debug(f"Utilizing pseudo-strip strategy for {repr(path)}")
            return [pseudo_strip(self, function) for function in self.decompile(path)]

        logger.debug(f"Decompiling {repr(path)} with symbols (pre-strip)...")
        with_symbols = self.decompile(path)

        logger.debug(f"Running `strip` on {repr(path)}...")
        strip_command = ["strip", str(path)]
        subprocess.run(strip_command, capture_output=True, text=True, check=True)

        logger.debug(f"Decompiling {repr(path)} without symbols (post-strip)...")
        without_symbols = {f.address: f for f in self.decompile(path)}

        # Attach the stripped output to the metadata of each symbol-bearing function.
        merged: List[DecompiledFunction] = []
        for func in with_symbols:
            counterpart = without_symbols.get(func.address)
            if not counterpart:
                logger.warning(
                    f"No stripped definition found for {func.name} @ {hex(func.address)}"
                )
                # Keep the unmodified function when no counterpart exists.
                merged.append(func)
                continue
            enriched = dict(
                func.metadata,
                stripped_definition=counterpart.definition,
                stripped_assembly=counterpart.assembly,
            )
            merged.append(replace(func, _metadata=enriched))
        return merged

decompile(path) abstractmethod

Decompiles a binary and retrieves all decompiled functions contained in it.

Parameters:

Name Type Description Default
path PathLike

The path to the binary file to be decompiled.

required

Returns:

Type Description
Sequence[DecompiledFunction]

A sequence of DecompiledFunction objects representing the functions extracted from the binary.

Source code in src/codablellm/core/decompiler.py
@abstractmethod
def decompile(self, path: PathLike) -> Sequence[DecompiledFunction]:
    """
    Decompiles the binary at `path` and returns every function found in it.

    Parameters:
        path: The path to the binary file to be decompiled.

    Returns:
        A sequence of `DecompiledFunction` objects, one per function extracted from the binary.
    """
    return None

decompile_stripped(path, strategy)

Decompiles a binary and applies a symbol removal strategy.

Parameters:

Name Type Description Default
path PathLike

Path to the binary to decompile.

required
strategy SymbolRemovalStrategy

Strategy for symbol removal. Options include "strip" (using the strip CLI tool) or "pseudo-strip" (AST-based anonymization of symbols).

required

Returns:

Type Description
Sequence[DecompiledFunction]

A sequence of DecompiledFunction objects with symbol-stripped metadata.

Source code in src/codablellm/core/decompiler.py
def decompile_stripped(
    self, path: PathLike, strategy: "SymbolRemovalStrategy"
) -> Sequence[DecompiledFunction]:
    """
    Decompiles a binary and applies a symbol removal strategy.

    With the "strip" strategy the binary is decompiled, the `strip` CLI tool is run
    directly on `path` (so the file on disk is modified), and the binary is decompiled
    again. Functions are paired by address and the stripped definition and assembly are
    stored in the original function's metadata under `stripped_definition` and
    `stripped_assembly`. Any other strategy value is handled by `pseudo_strip`.

    Parameters:
        path: Path to the binary to decompile.
        strategy: Strategy for symbol removal. Options include "strip" (using the `strip` CLI tool)
                or "pseudo-strip" (AST-based anonymization of symbols).

    Returns:
        A sequence of `DecompiledFunction` objects with symbol-stripped metadata.
    """
    logger.info(f"Stripping {repr(path)}...")

    # Everything that is not the real `strip` tool goes through pseudo-stripping.
    if strategy != "strip":
        logger.debug(f"Utilizing pseudo-strip strategy for {repr(path)}")
        return [pseudo_strip(self, function) for function in self.decompile(path)]

    logger.debug(f"Decompiling {repr(path)} with symbols (pre-strip)...")
    with_symbols = self.decompile(path)

    logger.debug(f"Running `strip` on {repr(path)}...")
    strip_command = ["strip", str(path)]
    subprocess.run(strip_command, capture_output=True, text=True, check=True)

    logger.debug(f"Decompiling {repr(path)} without symbols (post-strip)...")
    without_symbols = {f.address: f for f in self.decompile(path)}

    # Attach the stripped output to the metadata of each symbol-bearing function.
    merged: List[DecompiledFunction] = []
    for func in with_symbols:
        counterpart = without_symbols.get(func.address)
        if not counterpart:
            logger.warning(
                f"No stripped definition found for {func.name} @ {hex(func.address)}"
            )
            # Keep the unmodified function when no counterpart exists.
            merged.append(func)
            continue
        enriched = dict(
            func.metadata,
            stripped_definition=counterpart.definition,
            stripped_assembly=counterpart.assembly,
        )
        merged.append(replace(func, _metadata=enriched))
    return merged

get_stripped_function_name(address) abstractmethod

Returns the anonymized name for a function at the given address.

Parameters:

Name Type Description Default
address int

The memory address of the function.

required

Returns:

Type Description
str

A stripped-down or anonymized function name (e.g., FUN_<addr>).

Source code in src/codablellm/core/decompiler.py
@abstractmethod
def get_stripped_function_name(self, address: int) -> str:
    """
    Gives the anonymized name used for the function located at `address`.

    Parameters:
        address: The memory address of the function.

    Returns:
        A stripped-down or anonymized function name (e.g., `FUN_<addr>`).
    """
    return None

ExtractConfig dataclass

Configuration for extracting source code functions.

Source code in src/codablellm/core/extractor.py
@dataclass(frozen=True)
class ExtractConfig:
    """
    Configuration for extracting source code functions.

    Raises:
        ValueError: If `max_workers` is set and less than 1, if `exclude_subpaths` and
            `exclusive_subpaths` share a path, or if `checkpoint` is negative.
    """

    max_workers: Optional[int] = None
    """
    Maximum number of files to extract functions in parallel.
    """
    accurate_progress: bool = True
    """
    Whether to accurately track progress by counting extractable files in advance. This may take
    longer to start but provides more accurate progress tracking.
    """
    transform: Optional[DynamicSymbol] = None
    """
    An optional transformation to apply to each source code function.
    """
    exclusive_subpaths: Set[Path] = field(default_factory=set)
    """
    A set of subpaths to exclusively extract functions from. If specified, only these subpaths will be extracted.
    """
    exclude_subpaths: Set[Path] = field(default_factory=set)
    """
    A set of subpaths to exclude from extraction. If specified, these subpaths will be ignored.
    """
    checkpoint: int = 10
    """
    The number of steps between saving checkpoints. Set to 0 to disable checkpoints.
    """
    use_checkpoint: bool = True
    """
    `True` if a checkpoint file should be loaded and used to resume extraction.
    """
    extract_as_repo: bool = True
    """
    `True` if the path should be treated as a repository root for calculating relative function scopes.
    """
    extractor_args: Dict[str, Sequence[Any]] = field(default_factory=dict)
    """
    Positional arguments to pass to the extractor's `__init__` method. The keys are language
    names. The values are sequences of arguments. For example, `{'C': [arg1, arg2]}`.
    """
    extractor_kwargs: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    """
    Keyword arguments to pass to the extractor's `__init__` method. The keys are language names.
    The values are dictionaries of keyword arguments. For example, `{'C': {'kwarg1': value1}}`.
    """
    # NOTE(review): undocumented upstream; presumably mirrors the decompiler's `strict`
    # flag (raise on failure instead of logging and continuing) - confirm.
    strict: bool = False

    def __post_init__(self) -> None:
        """
        Validates the configuration values.

        Raises:
            ValueError: If `max_workers` is set and is not a positive integer, if the two
                subpath sets overlap, or if `checkpoint` is negative.
        """
        # Compare against None instead of relying on truthiness: 0 is falsy and
        # would otherwise slip past the "positive integer" validation.
        if self.max_workers is not None and self.max_workers < 1:
            raise ValueError("Max workers must be a positive integer")
        if self.exclude_subpaths & self.exclusive_subpaths:
            raise ValueError(
                "Cannot have overlapping paths in exclude_subpaths and "
                "exclusive_subpaths"
            )
        if self.checkpoint < 0:
            raise ValueError("Checkpoint must be a non-negative integer")

    def get_transform(self) -> Optional[Transform]:
        """
        Resolves the configured transform symbol into a callable.

        Returns:
            The result of `dynamic_import(self.transform)` when a transform is configured,
            otherwise `None`.
        """
        if self.transform:
            return dynamic_import(self.transform)
        return None

accurate_progress = True class-attribute instance-attribute

Whether to accurately track progress by counting extractable files in advance. This may take longer to start but provides more accurate progress tracking.

checkpoint = 10 class-attribute instance-attribute

The number of steps between saving checkpoints. Set to 0 to disable checkpoints.

exclude_subpaths = field(default_factory=set) class-attribute instance-attribute

A set of subpaths to exclude from extraction. If specified, these subpaths will be ignored.

exclusive_subpaths = field(default_factory=set) class-attribute instance-attribute

A set of subpaths to exclusively extract functions from. If specified, only these subpaths will be extracted.

extract_as_repo = True class-attribute instance-attribute

True if the path should be treated as a repository root for calculating relative function scopes.

extractor_args = field(default_factory=dict) class-attribute instance-attribute

Positional arguments to pass to the extractor's __init__ method. The keys are language names. The values are sequences of arguments. For example, {'C': [arg1, arg2]}.

extractor_kwargs = field(default_factory=dict) class-attribute instance-attribute

Keyword arguments to pass to the extractor's __init__ method. The keys are language names. The values are dictionaries of keyword arguments. For example, {'C': {'kwarg1': value1}}.

max_workers = None class-attribute instance-attribute

Maximum number of files to extract functions in parallel.

transform = None class-attribute instance-attribute

An optional transformation to apply to each source code function.

use_checkpoint = True class-attribute instance-attribute

True if a checkpoint file should be loaded and used to resume extraction.

Extractor

Bases: ABC

Abstract base class for source code extractors.

Extractors are responsible for parsing source code files and returning extracted function definitions as SourceFunction instances.

Source code in src/codablellm/core/extractor.py
class Extractor(ABC):
    """
    Interface implemented by every source code extractor.

    An extractor parses source code files and reports the function definitions it
    finds as `SourceFunction` instances.
    """

    @abstractmethod
    def extract(
        self, file_path: PathLike, repo_path: Optional[PathLike] = None
    ) -> Sequence[SourceFunction]:
        """
        Pulls the functions out of a single source code file.

        Parameters:
            file_path: The path to the source file.
            repo_path: Optional repository root path to calculate relative function scopes.

        Returns:
            A sequence of `SourceFunction` instances extracted from the file.
        """

    @abstractmethod
    def get_extractable_files(self, path: PathLike) -> Set[Path]:
        """
        Finds every file under `path` that this extractor is able to process.

        Parameters:
            path: A file or directory path to search for extractable files.

        Returns:
            A set of `Path` objects representing extractable source files.
        """

    def is_installed(self) -> bool:
        """
        Reports whether the extractor is available for use.

        Returns:
            `True` in this base implementation.
        """
        installed = True
        return installed

extract(file_path, repo_path=None) abstractmethod

Extracts functions from the given source code file.

Parameters:

Name Type Description Default
file_path PathLike

The path to the source file.

required
repo_path Optional[PathLike]

Optional repository root path to calculate relative function scopes.

None

Returns:

Type Description
Sequence[SourceFunction]

A sequence of SourceFunction instances extracted from the file.

Source code in src/codablellm/core/extractor.py
@abstractmethod
def extract(
    self, file_path: PathLike, repo_path: Optional[PathLike] = None
) -> Sequence[SourceFunction]:
    """
    Pulls the functions out of a single source code file.

    Parameters:
        file_path: The path to the source file.
        repo_path: Optional repository root path to calculate relative function scopes.

    Returns:
        A sequence of `SourceFunction` instances extracted from the file.
    """
    return None

get_extractable_files(path) abstractmethod

Retrieves all files that can be processed by the extractor from the given path.

Parameters:

Name Type Description Default
path PathLike

A file or directory path to search for extractable files.

required

Returns:

Type Description
Set[Path]

A set of Path objects representing extractable source files.

Source code in src/codablellm/core/extractor.py
@abstractmethod
def get_extractable_files(self, path: PathLike) -> Set[Path]:
    """
    Finds every file under `path` that this extractor is able to process.

    Parameters:
        path: A file or directory path to search for extractable files.

    Returns:
        A set of `Path` objects representing extractable source files.
    """
    return None

Function dataclass

Bases: SupportsJSON

Base class for functions used in datasets.

Source code in src/codablellm/core/function.py
@dataclass(frozen=True)
class Function(SupportsJSON):
    """
    Common base for every function stored in a dataset.
    """

    uid: str
    """
    A unique identifier for the function. This should be unique across all functions in a dataset.
    """
    path: Path
    """
    Absolute path to the file containing the function.
    """
    name: str
    """
    The name of the function.
    """
    definition: str
    """
    The source code of the function.
    """
    _metadata: Mapping[str, Any] = field(default_factory=dict, kw_only=True)

    def __post_init__(self) -> None:
        """
        Rejects relative paths.

        Raises:
            ValueError: If `path` is not absolute.
        """
        if self.path.is_absolute():
            return
        raise ValueError("Path to source code file must be absolute.")

    @property
    def metadata(self) -> Mapping[str, Any]:
        """
        The metadata associated with the function, returned as a fresh dictionary.

        Returns:
            A mapping containing the metadata associated with the function.
        """
        return dict(self._metadata.items())

    @staticmethod
    def create_uid(file_path: Path, name: str, repo_path: Optional[Path] = None) -> str:
        """
        Builds the unique identifier of a function.

        Without a repository path the scope is the last component of `file_path`. With a
        repository path the scope is the repository directory name followed by the file's
        path relative to the repository root, with every path component joined by `::`.

        Parameters:
            file_path: The path to the source code file containing the function definition.
            name: The name of the function.
            repo_path: Optional repository root path to calculate the relative file path. If provided, the UID is constructed using the relative path from the repository root.

        Returns:
            A string UID in the format of `<relative_path_or_filename>::<function_name>`.

        Raises:
            ValueError: If the given `file_path` is not a subpath of `repo_path`.
        """
        if not repo_path:
            scope = file_path.parts[-1]
        else:
            try:
                resolved_file = file_path.resolve()
                inside_repo = resolved_file.relative_to(repo_path.resolve())
                scoped_path = repo_path.name / inside_repo
            except ValueError as e:
                raise ValueError(
                    f'Path to "{file_path.name}" is not in the '
                    f'"{repo_path.name}" repository.'
                ) from e
            else:
                scope = "::".join(scoped_path.parts)
        return f"{scope}::{name}"

    @staticmethod
    def get_function_name(uid: str) -> str:
        """
        Reads the function name back out of a UID.

        Parameters:
            uid: The unique identifier of the function.

        Returns:
            The function name, i.e. the text after the last `::` separator.
        """
        *_, function_name = uid.split("::")
        return function_name

    def to_json(self) -> FunctionJSONObject:
        """
        Serializes the function to a JSON-compatible mapping.

        Returns:
            A mapping with the `definition`, `metadata`, `name`, `path` and `uid` entries.
        """
        metadata_copy = dict(self.metadata)
        path_text = str(self.path)
        return {
            "definition": self.definition,
            "metadata": metadata_copy,
            "name": self.name,
            "path": path_text,
            "uid": self.uid,
        }

    @classmethod
    def from_json(cls, json_obj: FunctionJSONObject) -> "Function":
        """
        Reconstructs a function from the mapping produced by `to_json`.

        Parameters:
            json_obj: Mapping holding `uid`, `path`, `name`, `definition` and `metadata`.

        Returns:
            A new instance populated from `json_obj`.
        """
        return cls(
            json_obj["uid"],
            Path(json_obj["path"]),
            json_obj["name"],
            json_obj["definition"],
            _metadata=json_obj["metadata"],
        )

definition instance-attribute

The source code of the function.

metadata property

A read-only view of the metadata associated with the function.

Returns:

Type Description
Mapping[str, Any]

A mapping containing the metadata associated with the function.

name instance-attribute

The name of the function.

path instance-attribute

Absolute path to the file containing the function.

uid instance-attribute

A unique identifier for the function. This should be unique across all functions in a dataset.

create_uid(file_path, name, repo_path=None) staticmethod

Creates a unique identifier for a function.

The UID is constructed based on the function's file path and name. If a repository path is provided, the UID uses the file's relative path from the repository root to ensure precision across different subdirectories.

Parameters:

Name Type Description Default
file_path Path

The path to the source code file containing the function definition.

required
name str

The name of the function.

required
repo_path Optional[Path]

Optional repository root path to calculate the relative file path. If provided, the UID is constructed using the relative path from the repository root.

None

Returns:

Type Description
str

A string UID in the format of <relative_path_or_filename>::<function_name>.

Raises:

Type Description
ValueError

If the given file_path is not a subpath of repo_path.

Source code in src/codablellm/core/function.py
@staticmethod
def create_uid(file_path: Path, name: str, repo_path: Optional[Path] = None) -> str:
    """
    Creates a unique identifier for a function.

    The UID is constructed based on the function's file path and name. If a repository path is
    provided, the UID uses the file's relative path from the repository root to ensure
    precision across different subdirectories.

    Parameters:
        file_path: The path to the source code file containing the function definition.
        name: The name of the function.
        repo_path: Optional repository root path to calculate the relative file path. If provided, the UID is constructed using the relative path from the repository root.

    Returns:
        A string UID in the format of `<relative_path_or_filename>::<function_name>`.

    Raises:
        ValueError: If the given `file_path` is not a subpath of `repo_path`.
    """
    if repo_path:
        try:
            relative_file_path = repo_path.name / file_path.resolve().relative_to(
                repo_path.resolve()
            )
            scope = "::".join(relative_file_path.parts)
        except ValueError as e:
            raise ValueError(
                f'Path to "{file_path.name}" is not in the '
                f'"{repo_path.name}" repository.'
            ) from e
    else:
        scope = file_path.parts[-1]
    return f"{scope}::{name}"

get_function_name(uid) staticmethod

Extracts the function name from a UID.

Parameters:

Name Type Description Default
uid str

The unique identifier of the function.

required

Returns:

Type Description
str

The function name.

Source code in src/codablellm/core/function.py
@staticmethod
def get_function_name(uid: str) -> str:
    """
    Extracts the function name from a UID.

    Parameters:
        uid: The unique identifier of the function.

    Returns:
        The function name.
    """
    return uid.split("::")[-1]

SourceFunction dataclass

Bases: Function

A subroutine extracted from source code.

Source code in src/codablellm/core/function.py
@dataclass(frozen=True)
class SourceFunction(Function):
    """
    A subroutine extracted from source code.

    Extends `Function` with the source language, the byte span of the definition inside its
    file and, for methods, the name of the enclosing class.
    """

    language: str
    """
    The programming language of the source code.
    """
    start_byte: int
    """
    The starting byte offset of the function definition in the source code file.
    """
    end_byte: int
    """
    The ending byte offset of the function definition in the source code file.
    """
    class_name: Optional[str] = None
    """
    The name of the class containing the function, if applicable.
    """

    def __post_init__(self) -> None:
        # Validate the byte span. An empty span (start == end) is accepted.
        if self.start_byte < 0:
            raise ValueError("Start byte must be a non-negative integer")
        if self.start_byte > self.end_byte:
            raise ValueError("Start byte must be less than end byte")

    @property
    def is_method(self) -> bool:
        """
        Indicates whether the function is a method of a class.

        Returns:
            `True` if the function is defined within a class.
        """
        return self.class_name is not None

    def with_definition(
        self,
        definition: str,
        name: Optional[str] = None,
        write_back: bool = True,
        metadata: Mapping[str, Any] = {},
    ) -> "SourceFunction":
        """
        Creates a new `SourceFunction` instance with an updated definition and optional new name.

        When a new name is provided, the new UID keeps the scope of the current UID (everything
        before its last `::`) and replaces only the trailing function name. Existing and new
        metadata are merged, with existing entries taking precedence on key collisions. The
        updated definition is optionally written back to the source file.

        Parameters:
            definition: The new function definition to use.
            name: Optional new function name. If not provided, retains the current function name and UID.
            write_back: If `True`, writes the updated definition to the original source file.
            metadata: Additional metadata to merge with the existing function metadata.

        Returns:
            A new `SourceFunction` instance with the updated definition and metadata.
        """
        if not name:
            name = self.name
            uid = self.uid
        else:
            # A UID built by `create_uid` already ends in `...::<function_name>`, so only
            # that last component is swapped. Re-running `create_uid` and prepending the old
            # scope would duplicate the filename (and class name) inside the UID.
            scope, separator, _ = self.uid.rpartition("::")
            if separator:
                uid = f"{scope}::{name}"
            else:
                # No scope to preserve: build a fresh UID from the path and class name.
                uid = SourceFunction.create_uid(
                    self.path, name, class_name=self.class_name
                )
        source_function = SourceFunction(
            uid,
            self.path,
            name,
            definition,
            self.language,
            self.start_byte,
            # NOTE(review): len() counts characters, not encoded bytes; confirm this is
            # intended for definitions containing non-ASCII text.
            self.start_byte + len(definition),
            class_name=self.class_name,
            # Existing metadata is unpacked last, so it wins over `metadata` on collisions.
            _metadata={**metadata, **self.metadata},
        )
        if write_back:
            logger.debug(
                f"Writing back modified definition to {source_function.path.name}..."
            )
            # NOTE(review): str.replace substitutes every occurrence of the old definition
            # text in the file, not only the one at `start_byte`.
            modified_code = source_function.path.read_text().replace(
                self.definition, definition
            )
            source_function.path.write_text(modified_code)
        return source_function

    def to_json(self) -> SourceFunctionJSONObject:
        # Base-class keys are unpacked last and therefore override same-named keys here.
        function_json = super().to_json()
        return {
            "language": self.language,
            "start_byte": self.start_byte,
            "end_byte": self.end_byte,
            "class_name": self.class_name,
            **function_json,
        }

    @staticmethod
    def create_uid(
        file_path: Path,
        name: str,
        repo_path: Optional[Path] = None,
        class_name: Optional[str] = None,
    ) -> str:
        """
        Creates a unique identifier (UID) for a source function to be used as a dataset key.

        The UID is based on the function's file path and name, and optionally includes the class name
        if the function is a method. If a repository path is provided, the UID uses the relative file path.

        Parameters:
            file_path: The full file path of the function definition.
            name: The name of the function.
            repo_path: Optional repository root path for relative path calculation.
            class_name: The class name to include in the UID if the function is a method.

        Returns:
            A UID string in the format: `<relative_path_or_filename>::<class_name>::<function_name>` if `class_name` is provided, otherwise `<relative_path_or_filename>::<function_name>`.
        """
        uid = Function.create_uid(file_path, name, repo_path=repo_path)
        if class_name:
            # Insert the class name between the scope and the trailing function name.
            scope, function = uid.rsplit("::", maxsplit=1)
            uid = f"{scope}::{class_name}::{function}"
        return uid

    @staticmethod
    def get_function_name(uid: str) -> str:
        """
        Extracts the function name from a UID, removing any class name prefix.

        Parameters:
            uid: The unique identifier of the function.

        Returns:
            The function name.
        """
        return Function.get_function_name(uid).rsplit("::", maxsplit=1)[-1]

    @classmethod
    def from_json(cls, json_obj: SourceFunctionJSONObject) -> "SourceFunction":
        # Rebuild the instance from the keys produced by `to_json`.
        function = cls(
            json_obj["uid"],
            Path(json_obj["path"]),
            json_obj["name"],
            json_obj["definition"],
            json_obj["language"],
            json_obj["start_byte"],
            json_obj["end_byte"],
            json_obj["class_name"],
            _metadata=json_obj["metadata"],
        )
        return function

    @classmethod
    def from_source(
        cls,
        file_path: Path,
        language: str,
        definition: str,
        name: str,
        start_byte: int,
        end_byte: int,
        class_name: Optional[str] = None,
        repo_path: Optional[Path] = None,
        metadata: Mapping[str, Any] = {},
    ) -> "SourceFunction":
        """
        Creates a `SourceFunction` instance from source code information.

        The UID is derived from the file path, function name, optional repository root and
        optional class name via `SourceFunction.create_uid`.

        Parameters:
            file_path: The file path where the function is defined.
            language: The programming language of the function.
            definition: The full source code of the function definition.
            name: The function name.
            start_byte: The starting byte offset of the function in the source file.
            end_byte: The ending byte offset of the function in the source file.
            class_name: Optional name of the class containing the function.
            repo_path: Optional repository root path for relative UID creation.
            metadata: Additional metadata to associate with the function.

        Returns:
            A `SourceFunction` instance populated with the given information and metadata.
        """
        function = cls(
            SourceFunction.create_uid(
                file_path, name, repo_path=repo_path, class_name=class_name
            ),
            file_path,
            name,
            definition,
            language,
            start_byte,
            end_byte,
            class_name=class_name,
            # Copy so instances never share the `{}` default (or the caller's mapping).
            _metadata=dict(metadata),
        )
        return function

class_name = None class-attribute instance-attribute

The name of the class containing the function, if applicable.

end_byte instance-attribute

The ending byte offset of the function definition in the source code file.

is_method property

Indicates whether the function is a method of a class.

Returns:

Type Description
bool

True if the function is defined within a class.

language instance-attribute

The programming language of the source code.

start_byte instance-attribute

The starting byte offset of the function definition in the source code file.

create_uid(file_path, name, repo_path=None, class_name=None) staticmethod

Creates a unique identifier (UID) for a source function to be used as a dataset key.

The UID is based on the function's file path and name, and optionally includes the class name if the function is a method. If a repository path is provided, the UID uses the relative file path.

Parameters:

Name Type Description Default
file_path Path

The full file path of the function definition.

required
name str

The name of the function.

required
repo_path Optional[Path]

Optional repository root path for relative path calculation.

None
class_name Optional[str]

The class name to include in the UID if the function is a method.

None

Returns:

Type Description
str

A UID string in the format: <relative_path_or_filename>::<class_name>::<function_name> if class_name is provided, otherwise <relative_path_or_filename>::<function_name>.

Source code in src/codablellm/core/function.py
@staticmethod
def create_uid(
    file_path: Path,
    name: str,
    repo_path: Optional[Path] = None,
    class_name: Optional[str] = None,
) -> str:
    """
    Builds the dataset key (UID) for a source function.

    The base UID comes from `Function.create_uid`, which uses the file path (relative to
    `repo_path` when one is given) and the function name. For methods, the class name is
    inserted just before the function name.

    Parameters:
        file_path: The full file path of the function definition.
        name: The name of the function.
        repo_path: Optional repository root path for relative path calculation.
        class_name: The class name to include in the UID if the function is a method.

    Returns:
        A UID string in the format: `<relative_path_or_filename>::<class_name>::<function_name>` if `class_name` is provided, otherwise `<relative_path_or_filename>::<function_name>`.
    """
    base_uid = Function.create_uid(file_path, name, repo_path=repo_path)
    if not class_name:
        return base_uid
    # Split off the trailing function name so the class name can be slotted in before it.
    prefix, _, leaf = base_uid.rpartition("::")
    return "::".join((prefix, class_name, leaf))

from_source(file_path, language, definition, name, start_byte, end_byte, class_name=None, repo_path=None, metadata={}) classmethod

Creates a SourceFunction instance from source code information.

Parameters:

Name Type Description Default
file_path Path

The file path where the function is defined.

required
language str

The programming language of the function.

required
definition str

The full source code of the function definition.

required
name str

The function name.

required
start_byte int

The starting byte offset of the function in the source file.

required
end_byte int

The ending byte offset of the function in the source file.

required
class_name Optional[str]

Optional name of the class containing the function.

None
repo_path Optional[Path]

Optional repository root path for relative UID creation.

None
metadata Mapping[str, Any]

Additional metadata to associate with the function.

{}

Returns:

Type Description
SourceFunction

A SourceFunction instance populated with the given information and metadata.

Source code in src/codablellm/core/function.py
@classmethod
def from_source(
    cls,
    file_path: Path,
    language: str,
    definition: str,
    name: str,
    start_byte: int,
    end_byte: int,
    class_name: Optional[str] = None,
    repo_path: Optional[Path] = None,
    metadata: Mapping[str, Any] = {},
) -> "SourceFunction":
    """
    Creates a `SourceFunction` instance from source code information.

    The UID is derived from the file path, function name, optional repository root and
    optional class name via `SourceFunction.create_uid`.

    Parameters:
        file_path: The file path where the function is defined.
        language: The programming language of the function.
        definition: The full source code of the function definition.
        name: The function name.
        start_byte: The starting byte offset of the function in the source file.
        end_byte: The ending byte offset of the function in the source file.
        class_name: Optional name of the class containing the function.
        repo_path: Optional repository root path for relative UID creation.
        metadata: Additional metadata to associate with the function.

    Returns:
        A `SourceFunction` instance populated with the given information and metadata.
    """
    function = cls(
        SourceFunction.create_uid(
            file_path, name, repo_path=repo_path, class_name=class_name
        ),
        file_path,
        name,
        definition,
        language,
        start_byte,
        end_byte,
        class_name=class_name,
        # Copy so instances never share the `{}` default (or the caller's mapping).
        _metadata=dict(metadata),
    )
    return function

get_function_name(uid) staticmethod

Extracts the function name from a UID, removing any class name prefix.

Parameters:

Name Type Description Default
uid str

The unique identifier of the function.

required

Returns:

Type Description
str

The function name.

Source code in src/codablellm/core/function.py
@staticmethod
def get_function_name(uid: str) -> str:
    """
    Returns the bare function name encoded in a UID, without any class name prefix.

    Parameters:
        uid: The unique identifier of the function.

    Returns:
        The function name.
    """
    base_name = Function.get_function_name(uid)
    # Drop anything that still precedes a `::` separator in the extracted name.
    *_, function_name = base_name.rsplit("::", maxsplit=1)
    return function_name

with_definition(definition, name=None, write_back=True, metadata={})

Creates a new SourceFunction instance with an updated definition and optional new name.

This method generates a new UID if a new name is provided, merges existing and new metadata, and optionally writes the updated definition back to the source file.

Parameters:

Name Type Description Default
definition str

The new function definition to use.

required
name Optional[str]

Optional new function name. If not provided, retains the current function name and UID.

None
write_back bool

If True, writes the updated definition to the original source file.

True
metadata Mapping[str, Any]

Additional metadata to merge with the existing function metadata.

{}

Returns:

Type Description
SourceFunction

A new SourceFunction instance with the updated definition and metadata.

Source code in src/codablellm/core/function.py
def with_definition(
    self,
    definition: str,
    name: Optional[str] = None,
    write_back: bool = True,
    metadata: Mapping[str, Any] = {},
) -> "SourceFunction":
    """
    Creates a new `SourceFunction` instance with an updated definition and optional new name.

    When a new name is provided, the new UID keeps the scope of the current UID (everything
    before its last `::`) and replaces only the trailing function name. Existing and new
    metadata are merged, with existing entries taking precedence on key collisions. The
    updated definition is optionally written back to the source file.

    Parameters:
        definition: The new function definition to use.
        name: Optional new function name. If not provided, retains the current function name and UID.
        write_back: If `True`, writes the updated definition to the original source file.
        metadata: Additional metadata to merge with the existing function metadata.

    Returns:
        A new `SourceFunction` instance with the updated definition and metadata.
    """
    if not name:
        name = self.name
        uid = self.uid
    else:
        # A UID built by `create_uid` already ends in `...::<function_name>`, so only that
        # last component is swapped. Re-running `create_uid` and prepending the old scope
        # would duplicate the filename (and class name) inside the UID.
        scope, separator, _ = self.uid.rpartition("::")
        if separator:
            uid = f"{scope}::{name}"
        else:
            # No scope to preserve: build a fresh UID from the path and class name.
            uid = SourceFunction.create_uid(self.path, name, class_name=self.class_name)
    source_function = SourceFunction(
        uid,
        self.path,
        name,
        definition,
        self.language,
        self.start_byte,
        # NOTE(review): len() counts characters, not encoded bytes; confirm this is
        # intended for definitions containing non-ASCII text.
        self.start_byte + len(definition),
        class_name=self.class_name,
        # Existing metadata is unpacked last, so it wins over `metadata` on collisions.
        _metadata={**metadata, **self.metadata},
    )
    if write_back:
        logger.debug(
            f"Writing back modified definition to {source_function.path.name}..."
        )
        # NOTE(review): str.replace substitutes every occurrence of the old definition
        # text in the file, not only the one at `start_byte`.
        modified_code = source_function.path.read_text().replace(
            self.definition, definition
        )
        source_function.path.write_text(modified_code)
    return source_function