Skip to content

decompiler

Module containing functions for decompiling binaries.

This module manages decompiler registration and configuration, allowing codablellm to use different backends for binary decompilation.

GET_C_SYMBOLS_QUERY = '(function_definition declarator: (function_declarator declarator: (identifier) @function.symbols ))(call_expression function: (identifier) @function.symbols)' module-attribute

Tree-sitter query used to extract all C symbols from a function definition.

DecompileConfig dataclass

Configuration for decompiling binaries.

Source code in src/codablellm/core/decompiler.py
@dataclass(frozen=True)
class DecompileConfig:
    """
    Configuration for decompiling binaries.
    """

    max_workers: Optional[int] = None
    """
    Maximum number of binaries to decompile in parallel.
    """
    decompiler_args: Sequence[Any] = field(default_factory=list)
    """
    Positional arguments to pass to the decompiler's `__init__` method.
    """
    decompiler_kwargs: Mapping[str, Any] = field(default_factory=dict)
    """
    Keyword arguments to pass to the decompiler's `__init__` method.
    """
    symbol_remover: Optional[SymbolRemovalStrategy] = None
    """
    Optional strategy used to remove symbols from decompiled functions.
    """
    recursive: bool = False
    """
    If True, recursively scan directories for binaries to decompile.
    """
    strict: bool = False
    """
    If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.
    """

    def __post_init__(self) -> None:
        if self.max_workers:
            if self.max_workers < 1:
                raise ValueError("Max workers must be a positive integer")
            os.environ[CODABLELLM_MAX_WORKERS_ENVIRON_KEY] = str(self.max_workers)

decompiler_args = field(default_factory=list) class-attribute instance-attribute

Positional arguments to pass to the decompiler's __init__ method.

decompiler_kwargs = field(default_factory=dict) class-attribute instance-attribute

Keyword arguments to pass to the decompiler's __init__ method.

max_workers = None class-attribute instance-attribute

Maximum number of binaries to decompile in parallel.

recursive = False class-attribute instance-attribute

If True, recursively scan directories for binaries to decompile.

strict = False class-attribute instance-attribute

If True, raise exceptions on decompilation failures; otherwise, continue and log warnings.

symbol_remover = None class-attribute instance-attribute

Optional strategy used to remove symbols from decompiled functions.

Decompiler

Bases: ABC

Abstract base class for a decompiler that extracts decompiled functions from compiled binaries.

Source code in src/codablellm/core/decompiler.py
class Decompiler(ABC):
    """
    Abstract base class for a decompiler that extracts decompiled functions from compiled binaries.
    """

    @abstractmethod
    def decompile(self, path: PathLike) -> Sequence[DecompiledFunction]:
        """
        Decompiles a binary and retrieves all decompiled functions contained in it.

        Parameters:
            path: The path to the binary file to be decompiled.

        Returns:
            A sequence of `DecompiledFunction` objects representing the functions extracted from the binary.
        """
        pass

    @abstractmethod
    def get_stripped_function_name(self, address: int) -> str:
        """
        Returns the anonymized name for a function at the given address.

        Parameters:
            address: The memory address of the function.

        Returns:
            A stripped-down or anonymized function name (e.g., `FUN_<addr>`).
        """
        pass

    def decompile_stripped(
        self, path: PathLike, strategy: "SymbolRemovalStrategy"
    ) -> Sequence[DecompiledFunction]:
        """
        Decompiles a binary and applies a symbol removal strategy.

        Parameters:
            path: Path to the binary to decompile.
            strategy: Strategy for symbol removal. Options include "strip" (using the `strip` CLI tool)
                    or "pseudo-strip" (AST-based anonymization of symbols).

        Returns:
            A sequence of `DecompiledFunction` objects with symbol-stripped metadata.
        """
        logger.info(f"Stripping {repr(path)}...")
        if strategy == "strip":
            logger.debug(f"Decompiling {repr(path)} with symbols (pre-strip)...")
            debug_functions = self.decompile(path)

            logger.debug(f"Running `strip` on {repr(path)}...")
            subprocess.run(
                ["strip", str(path)], capture_output=True, text=True, check=True
            )

            logger.debug(f"Decompiling {repr(path)} without symbols (post-strip)...")
            stripped_functions = self.decompile(path)
            stripped_by_addr = {f.address: f for f in stripped_functions}

            # Merge stripped data into metadata of original functions
            combined: List[DecompiledFunction] = []
            for func in debug_functions:
                stripped = stripped_by_addr.get(func.address)
                if stripped:
                    new_metadata = {
                        **func.metadata,
                        "stripped_definition": stripped.definition,
                        "stripped_assembly": stripped.assembly,
                    }
                    combined.append(replace(func, _metadata=new_metadata))
                else:
                    logger.warning(
                        f"No stripped definition found for {func.name} @ {hex(func.address)}"
                    )
                    combined.append(func)  # Fall back to original

            return combined
        logger.debug(f"Utilizing pseudo-strip strategy for {repr(path)}")
        return [pseudo_strip(self, function) for function in self.decompile(path)]

decompile(path) abstractmethod

Decompiles a binary and retrieves all decompiled functions contained in it.

Parameters:

Name Type Description Default
path PathLike

The path to the binary file to be decompiled.

required

Returns:

Type Description
Sequence[DecompiledFunction]

A sequence of DecompiledFunction objects representing the functions extracted from the binary.

Source code in src/codablellm/core/decompiler.py
@abstractmethod
def decompile(self, path: PathLike) -> Sequence[DecompiledFunction]:
    """
    Decompiles a binary and retrieves all decompiled functions contained in it.

    Parameters:
        path: The path to the binary file to be decompiled.

    Returns:
        A sequence of `DecompiledFunction` objects representing the functions extracted from the binary.
    """
    pass

decompile_stripped(path, strategy)

Decompiles a binary and applies a symbol removal strategy.

Parameters:

Name Type Description Default
path PathLike

Path to the binary to decompile.

required
strategy SymbolRemovalStrategy

Strategy for symbol removal. Options include "strip" (using the strip CLI tool) or "pseudo-strip" (AST-based anonymization of symbols).

required

Returns:

Type Description
Sequence[DecompiledFunction]

A sequence of DecompiledFunction objects with symbol-stripped metadata.

Source code in src/codablellm/core/decompiler.py
def decompile_stripped(
    self, path: PathLike, strategy: "SymbolRemovalStrategy"
) -> Sequence[DecompiledFunction]:
    """
    Decompiles a binary and applies a symbol removal strategy.

    Parameters:
        path: Path to the binary to decompile.
        strategy: Strategy for symbol removal. Options include "strip" (using the `strip` CLI tool)
                or "pseudo-strip" (AST-based anonymization of symbols).

    Returns:
        A sequence of `DecompiledFunction` objects with symbol-stripped metadata.
    """
    logger.info(f"Stripping {repr(path)}...")
    if strategy == "strip":
        logger.debug(f"Decompiling {repr(path)} with symbols (pre-strip)...")
        debug_functions = self.decompile(path)

        logger.debug(f"Running `strip` on {repr(path)}...")
        subprocess.run(
            ["strip", str(path)], capture_output=True, text=True, check=True
        )

        logger.debug(f"Decompiling {repr(path)} without symbols (post-strip)...")
        stripped_functions = self.decompile(path)
        stripped_by_addr = {f.address: f for f in stripped_functions}

        # Merge stripped data into metadata of original functions
        combined: List[DecompiledFunction] = []
        for func in debug_functions:
            stripped = stripped_by_addr.get(func.address)
            if stripped:
                new_metadata = {
                    **func.metadata,
                    "stripped_definition": stripped.definition,
                    "stripped_assembly": stripped.assembly,
                }
                combined.append(replace(func, _metadata=new_metadata))
            else:
                logger.warning(
                    f"No stripped definition found for {func.name} @ {hex(func.address)}"
                )
                combined.append(func)  # Fall back to original

        return combined
    logger.debug(f"Utilizing pseudo-strip strategy for {repr(path)}")
    return [pseudo_strip(self, function) for function in self.decompile(path)]

get_stripped_function_name(address) abstractmethod

Returns the anonymized name for a function at the given address.

Parameters:

Name Type Description Default
address int

The memory address of the function.

required

Returns:

Type Description
str

A stripped-down or anonymized function name (e.g., FUN_<addr>).

Source code in src/codablellm/core/decompiler.py
@abstractmethod
def get_stripped_function_name(self, address: int) -> str:
    """
    Returns the anonymized name for a function at the given address.

    Parameters:
        address: The memory address of the function.

    Returns:
        A stripped-down or anonymized function name (e.g., `FUN_<addr>`).
    """
    pass

create_decompiler(*args, **kwargs)

Initializes an instance of the decompiler that is being used by codablellm.

Parameters:

Name Type Description Default
*args Any

Positional arguments to pass to the decompiler's __init__ method.

()
**kwargs Any

Keyword arguments to pass to the decompiler's __init__ method.

{}

Returns:

Type Description
Decompiler

An instance of the specified Decompiler subclass.

Raises:

Type Description
DecompilerNotFound

If the specified decompiler cannot be imported or if the class cannot be found.

Source code in src/codablellm/core/decompiler.py
def create_decompiler(*args: Any, **kwargs: Any) -> Decompiler:
    """
    Initializes an instance of the decompiler that is being used by `codablellm`.

    Parameters:
        *args: Positional arguments to pass to the decompiler's `__init__` method.
        **kwargs: Keyword arguments to pass to the decompiler's `__init__` method.

    Returns:
        An instance of the specified `Decompiler` subclass.

    Raises:
        DecompilerNotFound: If the specified decompiler cannot be imported or if the class cannot be found.
    """
    decompiler_class: Type[Decompiler] = dynamic_import(_decompiler.symbol)
    return decompiler_class(*args, **kwargs)

decompile(*paths, config, as_flow=True)

Decompiles one or more binaries.

Parameters:

Name Type Description Default
paths PathLike

One or more paths pointing to binary files or directories.

()
config DecompileConfig

A DecompileConfig instance specifying options for decompilation.

required

Returns:

Type Description
List[DecompiledFunction]

A list of all decompiled functions from the provided binaries.

Source code in src/codablellm/core/decompiler.py
def decompile(
    *paths: PathLike, config: DecompileConfig, as_flow: bool = True
) -> List[DecompiledFunction]:
    """
    Decompiles one or more binaries.

    Parameters:
        paths: One or more paths pointing to binary files or directories.
        config: A `DecompileConfig` instance specifying options for decompilation.

    Returns:
        A list of all decompiled functions from the provided binaries.
    """
    flow = codablellm_flow if as_flow else lambda: lambda x: x.fn
    return flow()(decompile_bins_task)(*paths, config=config)

decompile_bins_task(*paths, config)

Decompiles binaries and extracts decompiled functions from the given path or list of paths.

Parameters:

Name Type Description Default
paths PathLike

A single path or sequence of paths pointing to binary files or directories containing binaries.

()
config DecompileConfig

Decompilation configuration options.

required
as_callable_pool

If True, returns a callable pool for deferred execution, typically used for progress bar handling or asynchronous processing.

required

Returns:

Type Description
List[DecompiledFunction]

Either a list of DecompiledFunction instances or a _CallableDecompiler for deferred execution.

Source code in src/codablellm/core/decompiler.py
@codablellm_task(name="decompile_bins")
def decompile_bins_task(
    *paths: PathLike, config: DecompileConfig
) -> List[DecompiledFunction]:
    """
    Decompiles binaries and extracts decompiled functions from the given path or list of paths.

    Parameters:
        paths: A single path or sequence of paths pointing to binary files or directories containing binaries.
        config: Decompilation configuration options.
        as_callable_pool: If `True`, returns a callable pool for deferred execution, typically used for progress bar handling or asynchronous processing.

    Returns:
        Either a list of `DecompiledFunction` instances or a `_CallableDecompiler` for deferred execution.
    """
    bins: List[Path] = []
    # Collect binary files
    for path in paths:
        path = Path(path)
        try:
            path.mkdir(parents=True, exist_ok=True)
        except FileExistsError:
            # In case the path is not a directory, continue
            pass
        # If a path is a directory, glob all child binaries
        glob = path.rglob if config.recursive else path.glob
        bins.extend([b for b in glob("*") if is_binary(b)] if path.is_dir() else [path])
    if not any(bins):
        logger.warning("No binaries found to decompile")
    # Create decompiler
    decompiler = create_decompiler(*config.decompiler_args, **config.decompiler_kwargs)
    # Submit decompile tasks
    logger.info(f"Submitting {get().name} decompile tasks...")
    futures = [
        decompile_task.submit(decompiler, bin, config.symbol_remover, return_state=True)
        for bin in bins
    ]
    results = [future.result(raise_on_failure=config.strict) for future in futures]
    functions: List[DecompiledFunction] = []
    for result in results:
        if isinstance(result, list):
            functions.extend(result)
    logger.info(f"Successfully decompiled {len(functions)} functions")
    return functions

decompile_task(decompiler, path, symbol_remover)

Prefect task for decompiling a single binary file using the specified decompiler.

Parameters:

Name Type Description Default
decompiler Decompiler

An instance of a Decompiler.

required
path PathLike

Path to the binary to decompile.

required
symbol_remover Optional[SymbolRemovalStrategy]

Optional symbol removal strategy to apply.

required

Returns:

Type Description
Sequence[DecompiledFunction]

A list of DecompiledFunction instances extracted from the binary.

Source code in src/codablellm/core/decompiler.py
@codablellm_low_level_task(name="decompile")
def decompile_task(
    decompiler: Decompiler,
    path: PathLike,
    symbol_remover: Optional[SymbolRemovalStrategy],
) -> Sequence[DecompiledFunction]:
    """
    Prefect task for decompiling a single binary file using the specified decompiler.

    Parameters:
        decompiler: An instance of a `Decompiler`.
        path: Path to the binary to decompile.
        symbol_remover: Optional symbol removal strategy to apply.

    Returns:
        A list of `DecompiledFunction` instances extracted from the binary.
    """
    if symbol_remover:
        return decompiler.decompile_stripped(path, symbol_remover)
    return decompiler.decompile(path)

get()

Returns the currently registered decompiler.

Returns:

Type Description
RegisteredDecompiler

A RegisteredDecompiler tuple containing the name and dynamic symbol of the active decompiler.

Source code in src/codablellm/core/decompiler.py
def get() -> RegisteredDecompiler:
    """
    Returns the currently registered decompiler.

    Returns:
        A `RegisteredDecompiler` tuple containing the name and dynamic symbol of the active decompiler.
    """
    return _decompiler

pseudo_strip(decompiler, function)

Creates a stripped version of the decompiled function with anonymized symbol names.

This method replaces all function symbols in both the function definition and assembly code with generated placeholders (e.g., FUN_<addr> or sub_<addr>), ensuring sensitive or original identifiers are removed. The resulting DecompiledFunction has an updated definition, stripped function name, and modified assembly code.

Parameters:

Name Type Description Default
decompiler Decompiler

The Decompiler instance to use for generating anonymized symbol names.

required
function DecompiledFunction

The DecompiledFunction to be stripped of its original symbols.

required

Returns:

Type Description
DecompiledFunction

A new DecompiledFunction instance with stripped symbols and updated assembly.

Source code in src/codablellm/core/decompiler.py
def pseudo_strip(
    decompiler: "Decompiler", function: DecompiledFunction
) -> "DecompiledFunction":
    """
    Creates a stripped version of the decompiled function with anonymized symbol names.

    This method replaces all function symbols in both the function definition and assembly code
    with generated placeholders (e.g., `FUN_<addr>` or `sub_<addr>`), ensuring sensitive or original
    identifiers are removed. The resulting `DecompiledFunction` has an updated definition,
    stripped function name, and modified assembly code.

    Parameters:
        decompiler: The `Decompiler` instance to use for generating anonymized symbol names.
        function: The `DecompiledFunction` to be stripped of its original symbols.

    Returns:
        A new `DecompiledFunction` instance with stripped symbols and updated assembly.
    """
    definition = function.definition
    assembly = function.assembly
    symbol_mapping: Dict[str, str] = {}

    def anonymize_symbol(node: Node) -> str:
        nonlocal symbol_mapping, assembly
        if not node.text:
            raise ValueError(f"Expected all function.symbols to have text: {node}")
        orig_function = node.text.decode()
        stripped_symbol = symbol_mapping.setdefault(
            orig_function, decompiler.get_stripped_function_name(function.address)
        )
        assembly = assembly.replace(orig_function, stripped_symbol)
        return stripped_symbol

    editor = ASTEditor(CExtractor.PARSER, definition)
    editor.match_and_edit(GET_C_SYMBOLS_QUERY, {"function.symbols": anonymize_symbol})
    definition = editor.source_code

    first_function = next(iter(symbol_mapping.values()), function.name)

    return DecompiledFunction(
        uid=function.uid,
        path=function.path,
        name=first_function,
        definition=definition,
        assembly=assembly,
        architecture=function.architecture,
        address=function.address,
    )

set(name, symbol)

Sets the decompiler used by codablellm.

Parameters:

Name Type Description Default
name str

The display name of the decompiler (e.g., "Ghidra", "Angr").

required
symbol DynamicSymbol

A tuple containing the file path and class name of the decompiler implementation.

required
Source code in src/codablellm/core/decompiler.py
def set(name: str, symbol: DynamicSymbol) -> None:
    """
    Sets the decompiler used by `codablellm`.

    Parameters:
        name: The display name of the decompiler (e.g., "Ghidra", "Angr").
        symbol: A tuple containing the file path and class name of the decompiler implementation.
    """

    global _decompiler
    file, class_name = symbol
    old_decompiler = _decompiler
    _decompiler = RegisteredDecompiler(name, (Path(file), class_name))
    # Instantiate decompiler to ensure it can be properly imported
    try:
        create_decompiler()
    except:
        logger.error(f"Could not create {repr(name)} extractor")
        _decompiler = old_decompiler
        raise
    logger.info(f"Using {repr(name)} ({file}::{class_name}) as the decompiler")