Skip to content

cli

The codablellm command line interface.

command(repo=REPO, save_as=SAVE_AS, bins=BINS, accurate=ACCURATE, build=BUILD, build_error_handling=BUILD_ERROR_HANDLING, cleanup=CLEANUP, cleanup_error_handling=CLEANUP_ERROR_HANDLING, checkpoint=CHECKPOINT, debug=DEBUG, decompile=DECOMPILE, decompiler=DECOMPILER, exclude_subpath=EXCLUDE_SUBPATH, exclusive_subpath=EXCLUSIVE_SUBPATH, extractors=EXTRACTORS, extra_path=EXTRA_PATH, generation_mode=GENERATION_MODE, git=GIT, ghidra=GHIDRA, ghidra_script=GHIDRA_SCRIPT, mapper=MAPPER, max_workers=MAX_WORKERS, parallel=PARALLEL, recursive=RECURSIVE, run_from=RUN_FROM, strict=STRICT, symbol_remover=SYMBOL_REMOVER, transform=TRANSFORM, use_checkpoint=USE_CHECKPOINT, url=URL, verbose=VERBOSE, version=VERSION)

Creates a code dataset from a local repository.

Source code in src/codablellm/cli.py
@app.command()
def command(
    repo: Path = REPO,
    save_as: Path = SAVE_AS,
    bins: Optional[List[Path]] = BINS,
    accurate: bool = ACCURATE,
    build: Optional[str] = BUILD,
    build_error_handling: CommandErrorHandler = BUILD_ERROR_HANDLING,
    cleanup: Optional[str] = CLEANUP,
    cleanup_error_handling: CommandErrorHandler = CLEANUP_ERROR_HANDLING,
    checkpoint: int = CHECKPOINT,
    debug: bool = DEBUG,
    decompile: bool = DECOMPILE,
    decompiler: DynamicSymbol = DECOMPILER,
    exclude_subpath: Optional[List[Path]] = EXCLUDE_SUBPATH,
    exclusive_subpath: Optional[List[Path]] = EXCLUSIVE_SUBPATH,
    extractors: Optional[Tuple[ExtractorConfigOperation, Path]] = EXTRACTORS,
    extra_path: List[Path] = EXTRA_PATH,
    generation_mode: GenerationMode = GENERATION_MODE,
    git: bool = GIT,
    ghidra: Optional[Path] = GHIDRA,
    ghidra_script: Path = GHIDRA_SCRIPT,
    mapper: DynamicSymbol = MAPPER,
    max_workers: Optional[int] = MAX_WORKERS,
    parallel: bool = PARALLEL,
    recursive: bool = RECURSIVE,
    run_from: RunFrom = RUN_FROM,
    strict: bool = STRICT,
    symbol_remover: Optional[SymbolRemover] = SYMBOL_REMOVER,
    transform: Optional[DynamicSymbol] = TRANSFORM,
    use_checkpoint: Optional[bool] = USE_CHECKPOINT,
    url: str = URL,
    verbose: bool = VERBOSE,
    version: bool = VERSION,
) -> None:
    """
    Creates a code dataset from a local repository.
    """
    # NOTE(review): the docstring is intentionally left terse; the CLI
    # framework presumably surfaces it verbatim as the command's help text,
    # so detailed documentation lives in comments instead.
    #
    # Overview of what this body does, in order:
    #   1. Registers a CLI-selected decompiler if it differs from the current
    #      one.
    #   2. Loads an extractor configuration (JSON) and sets/prepends/appends
    #      the extractors it lists.
    #   3. Downloads the repository into `repo` when `url` is given (cloned
    #      when `git` is true, otherwise decompressed).
    #   4. Builds either a decompiled-code dataset (optionally running a build
    #      command first) or a source-code dataset.
    #   5. Saves the dataset to `save_as`.
    #
    # Raises BadParameter when the extractor configuration is not valid JSON,
    # or when a decompiled dataset is requested without any binaries.
    #
    # NOTE(review): `debug`, `ghidra`, `ghidra_script`, `max_workers`,
    # `parallel`, `verbose` and `version` are not referenced in this body;
    # presumably they are consumed by option callbacks on their defaults —
    # verify against the option definitions.

    if decompiler != codablellm.decompiler.get().symbol:
        # Configure decompiler
        codablellm.decompiler.set(f"(CLI-Set) {decompiler[1]}", decompiler)

    if extractors:
        # Configure function extractors
        operation, config_file = extractors
        try:
            # Load JSON file containing extractors
            configured_extractors: Dict[str, DynamicSymbol] = json.loads(
                config_file.read_text()
            )
        except json.JSONDecodeError as e:
            raise BadParameter(
                "Could not decode extractor configuration file.",
                param_hint="--extractors",
            ) from e
        if operation == ExtractorConfigOperation.SET:
            codablellm.extractor.set_registered(configured_extractors)
        else:
            # The insertion position depends only on the operation, so it is
            # computed once rather than per extractor.
            order = "last" if operation == ExtractorConfigOperation.APPEND else "first"
            for language, symbol in configured_extractors.items():
                codablellm.extractor.register(language, symbol, order=order)

    if url:
        # Download remote repository
        if git:
            downloader.clone(url, repo)
        else:
            downloader.decompress(url, repo)

    # Create the extractor configuration
    extract_config = ExtractConfig(
        accurate_progress=accurate,
        transform=transform,
        exclusive_subpaths=set(exclusive_subpath) if exclusive_subpath else set(),
        exclude_subpaths=set(exclude_subpath) if exclude_subpath else set(),
        checkpoint=checkpoint,
        # Honour an explicit choice from the caller; fall back to the
        # previously hard-coded value (True) when nothing was specified.
        use_checkpoint=True if use_checkpoint is None else use_checkpoint,
        strict=strict,
    )

    if build and not decompile:
        # A build command only makes sense for decompiled datasets, so imply
        # --decompile. Only warn when the user did not already pass it.
        logger.warning(
            "--build specified without --decompile. --decompile enabled "
            "automatically."
        )
        decompile = True

    # Create source code/decompiled code dataset
    if decompile:
        if not bins or not any(bins):
            raise BadParameter(
                "Must specify at least one binary for decompiled code datasets.",
                param_hint="bins",
            )
        dataset_config = DecompiledCodeDatasetConfig(
            extract_config=extract_config,
            decompiler_config=DecompileConfig(
                symbol_remover=symbol_remover,  # type: ignore
                recursive=recursive,
                strict=strict,
            ),
            mapper=mapper,
        )
        if not build:
            dataset = codablellm.create_decompiled_dataset(
                repo, bins, extract_config=extract_config, dataset_config=dataset_config
            )
        else:
            # Build (and optionally clean up) the repository around extraction.
            manage_config = ManageConfig(
                cleanup_command=shlex.split(cleanup) if cleanup else None,
                run_from=run_from,  # type: ignore
                build_error_handling=build_error_handling,  # type: ignore
                cleanup_error_handling=cleanup_error_handling,  # type: ignore
                extra_paths=extra_path,
            )
            dataset = codablellm.compile_dataset(
                repo,
                bins,
                shlex.split(build),
                manage_config=manage_config,
                extract_config=extract_config,
                dataset_config=dataset_config,
                generation_mode=generation_mode,  # type: ignore
            )
    else:
        dataset_config = SourceCodeDatasetConfig(
            generation_mode=str(generation_mode),  # type: ignore
            extract_config=extract_config,
        )
        dataset = codablellm.create_source_dataset(repo, config=dataset_config)

    # Save dataset
    dataset.save_as(save_as)