Core API Reference

Pack

`write_pack(input_dir, output_dir, include_patterns=None, exclude_patterns=None, no_gitignore=False, no_default_patterns=False, include_hidden=False, verbose=False, quiet=False, remove_empty_lines=False, fast_pdf=False)`

Scans an input directory, parses supported files, chunks them, and generates a context pack.

This acts as the main orchestration engine for AgentPack, coordinating the Scanner, Parsers, and Chunker to produce a finalized manifest.yml and structured context directory.

Parameters:

Name	Type	Description	Default
`input_dir`	`str`	The root directory containing raw documents to scan.	required
`output_dir`	`str`	The destination directory where the pack will be written.	required
`include_patterns`	`List[str]`	Glob patterns of files to exclusively include.	`None`
`exclude_patterns`	`List[str]`	Glob patterns of files to exclude.	`None`
`no_gitignore`	`bool`	If True, skips loading ignore rules from `.gitignore` and `.agentpackignore` files.	`False`
`no_default_patterns`	`bool`	If True, disables built-in ignore rules (e.g. `.git/`).	`False`
`include_hidden`	`bool`	If True, includes hidden files and directories.	`False`
`verbose`	`bool`	If True, enables detailed progress logging.	`False`
`quiet`	`bool`	If True, suppresses all non-error output.	`False`
`remove_empty_lines`	`bool`	If True, strips empty lines from parsed text/markdown blocks to save tokens.	`False`
`fast_pdf`	`bool`	Forwarded to parser selection and parsing (`get_parser` / `_parse_one`).	`False`

Source code in src/agentpack/pack.py

def write_pack(
    input_dir: str, 
    output_dir: str,
    include_patterns: List[str] = None,
    exclude_patterns: List[str] = None,
    no_gitignore: bool = False,
    no_default_patterns: bool = False,
    include_hidden: bool = False,
    verbose: bool = False,
    quiet: bool = False,
    remove_empty_lines: bool = False,
    fast_pdf: bool = False
):
    """
    Scans an input directory, parses supported files, chunks them, and generates a context pack.

    This acts as the main orchestration engine for AgentPack, coordinating the Scanner, Parsers,
    and Chunker to produce a finalized `manifest.yml` and structured context directory.

    The output directory receives `chunks/`, `tables/` and `reports/` sub-directories,
    a `manifest.yml`, and `reports/pack_report.md`.

    Args:
        input_dir (str): The root directory containing raw documents to scan.
        output_dir (str): The destination directory where the pack will be written.
        include_patterns (List[str], optional): Glob patterns of files to exclusively include.
        exclude_patterns (List[str], optional): Glob patterns of files to exclude.
        no_gitignore (bool, optional): If True, ignores `.gitignore` and `.agentpackignore` files.
        no_default_patterns (bool, optional): If True, disables built-in ignore rules (e.g. `.git/`).
        include_hidden (bool, optional): If True, includes hidden files and directories.
        verbose (bool, optional): If True, enables detailed progress logging (unless `quiet` is set).
        quiet (bool, optional): If True, suppresses all non-error output.
        remove_empty_lines (bool, optional): If True, strips empty lines from parsed text/markdown blocks to save tokens.
        fast_pdf (bool, optional): Forwarded unchanged to `get_parser` and `_parse_one`;
            its effect on PDF parsing is defined there.
    """
    in_path = Path(input_dir)
    out_path = Path(output_dir)

    out_path.mkdir(parents=True, exist_ok=True)
    for subdir in ("chunks", "tables", "reports"):
        (out_path / subdir).mkdir(exist_ok=True)

    files = scan_directory(
        input_dir,
        include_hidden=include_hidden,
        no_gitignore=no_gitignore,
        no_default_patterns=no_default_patterns,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns
    )

    # `quiet` always wins over `verbose`.
    chatty = verbose and not quiet
    if chatty:
        print(f"Found {len(files)} files to pack.")

    sources = []
    all_chunks = []
    all_tables = []
    cache_dir = out_path / ".cache"

    # Dispatch parallel parses; results keyed by index to preserve manifest order.
    # The index is the file's position in the scan result, so source ids stay
    # tied to scan order even when some files have no parser and are skipped.
    indexed_files = [(i, fp) for i, fp in enumerate(files)
                     if get_parser(fp.suffix, fast_pdf=fast_pdf) is not None]
    docs_by_index: dict = {}

    # ThreadPoolExecutor rejects max_workers=0, hence the fallback to 1.
    max_workers = min(4, len(indexed_files)) if indexed_files else 1
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(
                _parse_one, fp, f"src_{i:03d}", fast_pdf, remove_empty_lines, cache_dir
            ): i
            for i, fp in indexed_files
        }
        for future in as_completed(futures):
            idx = futures[future]
            doc = future.result()
            if doc is not None:
                docs_by_index[idx] = doc

    for i, file_path in indexed_files:
        doc = docs_by_index.get(i)
        if doc is None:
            continue
        source_id = f"src_{i:03d}"

        if chatty:
            print(f"Parsed: {file_path}")

        for block in doc.blocks:
            if block.type != "table":
                continue
            table_path = out_path / "tables" / f"{block.block_id}.md"
            with open(table_path, "w", encoding="utf-8") as f:
                # `text` is Optional on the block model; write an empty file
                # rather than crash on a table block that carries no text.
                f.write(block.text or "")
            all_tables.append({
                "block_id": block.block_id,
                "source_id": source_id,
                "page": block.page,
                "path": f"tables/{block.block_id}.md",
            })

        doc_chunks = chunk_document(doc)
        all_chunks.extend(doc_chunks)

        # A source is "failed" if parsing reported an error or nothing was chunked.
        parse_error = next((w.message for w in doc.warnings if w.type == "parse_error"), None)
        has_parse_error = any(w.type == "parse_error" for w in doc.warnings)
        if has_parse_error or not doc_chunks:
            status = "failed"
            reason = parse_error if has_parse_error else "produced 0 chunks"
            if not quiet:
                print(f"  WARNING: {file_path.name} failed to index ({reason})")
        else:
            status = "success"

        sources.append({
            "id": source_id,
            "path": file_path.name,
            "type": doc.type,
            "checksum": doc.checksum,
            "status": status,
            "warnings": [w.dict() for w in doc.warnings]
        })

    # Write chunks to disk
    chunks_meta = []
    for chunk in all_chunks:
        chunk_file_path = out_path / chunk.path
        with open(chunk_file_path, "w", encoding="utf-8") as f:
            f.write(chunk.content)

        chunks_meta.append({
            "id": chunk.chunk_id,
            "source_id": chunk.source_id,
            "path": chunk.path,
            "token_count": chunk.token_count,
            "citation": chunk.metadata
        })

    manifest = {
        "pack": {
            "name": in_path.name,
            "version": _get_pack_version(),
            "generated_at": datetime.now(timezone.utc).isoformat()
        },
        "sources": sources,
        "chunks": chunks_meta,
        "tables": all_tables,
        "agent": {
            "instructions": [
                "Use citations when answering.",
                "Prefer raw chunks over summaries.",
                "Say not found when the corpus does not contain the answer."
            ]
        }
    }

    with open(out_path / "manifest.yml", "w", encoding="utf-8") as f:
        # sort_keys=False keeps the manifest sections in the order built above.
        yaml.dump(manifest, f, default_flow_style=False, sort_keys=False)

    # Write a simple pack report
    with open(out_path / "reports" / "pack_report.md", "w", encoding="utf-8") as f:
        f.write(f"# Pack Report\n\nGenerated from {input_dir}\n")
        f.write(f"- Sources: {len(sources)}\n")
        f.write(f"- Chunks: {len(all_chunks)}\n")

    # The completion message is informational, so it must honour `quiet` too.
    if not quiet:
        print(f"Pack generated at {out_path}")

Retrieve

Chunker

Models

`DocumentBlock`

Bases: BaseModel

Represents a unified contiguous block of semantic information from a document.

Source code in src/agentpack/models.py

class DocumentBlock(BaseModel):
    """Represents a unified contiguous block of semantic information from a document."""
    # Identifier of the block; for table blocks the packer uses it as the
    # file name under tables/ and records it in the manifest.
    block_id: str
    # Identifier of the owning source document (presumably the `src_NNN` id
    # assigned by the packer — confirm against the parsers).
    source_id: str
    # Kind of block. The packer writes blocks of type "table" to their own files.
    type: Literal["heading", "paragraph", "table", "page"]
    # Textual content of the block; may be absent.
    text: Optional[str] = None
    # Page the block came from, when known; copied into the manifest for tables.
    page: Optional[int] = None
    # NOTE(review): presumably the trail of enclosing headings — confirm against the chunker.
    section_path: List[str] = []
    # NOTE(review): presumably a (first, last) row span for tabular data — confirm against the parsers.
    row_range: Optional[Tuple[int, int]] = None

`ExtractionWarning`

Bases: BaseModel

Represents a warning encountered during parsing (e.g. unreadable PDF page).

Source code in src/agentpack/models.py

class ExtractionWarning(BaseModel):
    """Represents a warning encountered during parsing (e.g. unreadable PDF page)."""
    # Identifier of the source document the warning belongs to.
    source_id: str
    # Page the warning refers to, when applicable.
    page: Optional[int] = None
    # Warning category. The packer treats the value "parse_error" as a failed source.
    type: str
    # Human-readable description; used as the failure reason and shown in audit reports.
    message: str

`SourceDocument`

Bases: BaseModel

Represents a fully parsed document normalized into the Canonical Document Model.

Source code in src/agentpack/models.py

class SourceDocument(BaseModel):
    """Represents a fully parsed document normalized into the Canonical Document Model."""
    # Identifier of this source document.
    source_id: str
    # Location of the original file (NOTE(review): absolute vs. relative is not
    # visible here — confirm against the parsers).
    path: str
    # Document format; copied into the manifest's source entry.
    type: Literal["pdf", "markdown", "txt", "csv"]
    # Content checksum; copied into the manifest's source entry.
    checksum: str
    # Parsed content blocks; the packer iterates these to extract tables.
    blocks: List[DocumentBlock]
    # Warnings collected while parsing; serialized into the manifest.
    warnings: List[ExtractionWarning]

Audit & Validate

`audit_pack(pack_dir)`

Generates an audit report for an agentpack output directory.

Source code in src/agentpack/audit.py

def audit_pack(pack_dir: str) -> str:
    """Generates an audit report for an agentpack output directory.

    Reads `manifest.yml` from `pack_dir`, aggregates source/chunk/table/token
    statistics plus all extraction warnings, writes the report to
    `reports/validation_report.md` inside the pack, and returns the report text.

    Args:
        pack_dir (str): Path to a pack directory containing `manifest.yml`.

    Returns:
        str: The Markdown report, or a message starting with "Error:" when the
        manifest is missing, unparsable, empty, or not a mapping. No report
        file is written in the error cases.
    """
    base_path = Path(pack_dir)
    manifest_path = base_path / "manifest.yml"

    if not manifest_path.exists():
        return f"Error: Manifest not found at {manifest_path}"

    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            manifest = yaml.safe_load(f)
    except Exception as e:
        return f"Error: Failed to parse manifest YAML: {e}"

    # safe_load yields None for an empty file and may yield a list or scalar;
    # report those instead of failing on `.get` below.
    if not manifest:
        return "Error: Manifest is empty."
    if not isinstance(manifest, dict):
        return "Error: Manifest root is not a mapping."

    # `or` also covers keys that are present but null in the YAML.
    pack_info = manifest.get("pack") or {}
    sources = manifest.get("sources") or []
    chunks = manifest.get("chunks") or []
    tables = manifest.get("tables") or []

    files_processed = len(sources)
    total_chunks = len(chunks)
    total_tables = len(tables)

    token_counts = [chunk.get("token_count") or 0 for chunk in chunks]
    total_tokens = sum(token_counts)

    # First chunk with the strictly largest token count; stays (0, None) when
    # there are no chunks or every count is zero.
    max_chunk_size = 0
    largest_chunk_id = None
    for chunk, count in zip(chunks, token_counts):
        if count > max_chunk_size:
            max_chunk_size = count
            largest_chunk_id = chunk.get("id")

    warnings = [
        f"Source {source.get('id')}: [{warning.get('type')}] {warning.get('message')}"
        for source in sources
        for warning in (source.get("warnings") or [])
    ]

    # Format report
    report = [
        f"# AgentPack Audit Report for '{pack_info.get('name', 'Unknown')}'",
        f"Generated at: {pack_info.get('generated_at', 'Unknown')}\n",
        "## Statistics",
        f"- **Files Processed:** {files_processed}",
        f"- **Total Chunks:** {total_chunks}",
        f"- **Total Tables:** {total_tables}",
        f"- **Total Tokens:** {total_tokens}",
        f"- **Largest Chunk:** {max_chunk_size} tokens (ID: {largest_chunk_id})\n",
        "## Extraction Warnings",
    ]

    if warnings:
        report.extend(f"- {warning}" for warning in warnings)
    else:
        report.append("- No extraction warnings.")

    # Write report
    report_text = "\n".join(report)
    report_path = base_path / "reports" / "validation_report.md"
    report_path.parent.mkdir(exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    return report_text

`validate_pack(pack_dir)`

Validates the integrity of an agentpack output directory.

Source code in src/agentpack/validate.py

def validate_pack(pack_dir: str) -> List[str]:
    """Validates the integrity of an agentpack output directory.

    Checks that `manifest.yml` exists, parses, and has the required top-level
    keys; that every chunk and table refers to a known source id; that the
    chunk and table files named in the manifest exist on disk; and that no
    chunk exceeds the 4000-token safety limit.

    Args:
        pack_dir (str): Path to a pack directory containing `manifest.yml`.

    Returns:
        List[str]: Human-readable error messages; empty when the pack is valid.
    """
    errors = []
    base_path = Path(pack_dir)
    manifest_path = base_path / "manifest.yml"

    if not manifest_path.exists():
        return [f"Manifest not found at {manifest_path}"]

    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            manifest = yaml.safe_load(f)
    except Exception as e:
        return [f"Failed to parse manifest YAML: {e}"]

    if not manifest:
        return ["Manifest is empty."]

    # A YAML document may be a list or scalar; the checks below need a mapping.
    if not isinstance(manifest, dict):
        return ["Manifest root must be a mapping."]

    # Check basic schema
    for key in ["pack", "sources", "chunks", "tables"]:
        if key not in manifest:
            errors.append(f"Manifest missing top-level key: '{key}'")

    if errors:
        return errors

    # `or []` tolerates sections that are present but null in the YAML.
    source_ids = {s.get("id") for s in (manifest.get("sources") or []) if s.get("id")}

    # Validate chunks
    for i, chunk in enumerate(manifest.get("chunks") or []):
        chunk_id = chunk.get("id", f"unknown_index_{i}")
        source_id = chunk.get("source_id")

        if source_id not in source_ids:
            errors.append(f"Chunk '{chunk_id}' refers to unknown source_id '{source_id}'")

        chunk_path = chunk.get("path")
        if not chunk_path:
            errors.append(f"Chunk '{chunk_id}' missing path attribute")
        elif not (base_path / chunk_path).exists():
            errors.append(f"Chunk file missing: {base_path / chunk_path}")

        # Token validation (MVP arbitrary safe limit check)
        if (chunk.get("token_count") or 0) > 4000:
            errors.append(f"Chunk '{chunk_id}' exceeds safe token limit: {chunk.get('token_count')}")

    # Validate tables
    for i, table in enumerate(manifest.get("tables") or []):
        # The packer writes table entries under "block_id"; "id" is kept as a
        # fallback so manifests using that key are still identified.
        table_id = table.get("block_id") or table.get("id") or f"unknown_index_{i}"
        source_id = table.get("source_id")

        if source_id not in source_ids:
            errors.append(f"Table '{table_id}' refers to unknown source_id '{source_id}'")

        table_path = table.get("path")
        if table_path:
            full_path = base_path / table_path
            if not full_path.exists():
                errors.append(f"Table file missing: {full_path}")

    return errors

Scanner

`scan_directory(directory, include_hidden=False, no_gitignore=False, no_default_patterns=False, include_patterns=None, exclude_patterns=None)`

Recursively scans a directory and returns a list of supported files, respecting ignore rules.

Parameters:

Name	Type	Description	Default
`directory`	`str`	The root directory to scan.	required
`include_hidden`	`bool`	If True, includes hidden files and directories.	`False`
`no_gitignore`	`bool`	If True, skips loading rules from `.gitignore` and `.agentpackignore`.	`False`
`no_default_patterns`	`bool`	If True, skips the built-in ignore lists (`.git`, `node_modules`, etc.).	`False`
`include_patterns`	`List[str]`	If provided, only files matching these gitignore-style patterns are returned.	`None`
`exclude_patterns`	`List[str]`	If provided, files matching these gitignore-style patterns are skipped.	`None`

Returns:

Type	Description
`List[Path]`	The files that should be packed. Each path is built from the `directory` argument, so it is absolute or relative exactly as `directory` was.

Source code in src/agentpack/scanner.py

def scan_directory(
    directory: str, 
    include_hidden: bool = False,
    no_gitignore: bool = False,
    no_default_patterns: bool = False,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None
) -> List[Path]:
    """
    Recursively scans a directory and returns a list of supported files, respecting ignore rules.

    Directories that are hidden (unless `include_hidden`) or matched by the ignore
    spec are pruned and never descended into. A file is returned only if it is not
    hidden (unless `include_hidden`), its lower-cased suffix is in
    `SUPPORTED_EXTENSIONS`, it is not matched by the ignore spec or by
    `exclude_patterns`, and it matches `include_patterns` when those are given.

    Args:
        directory (str): The root directory to scan.
        include_hidden (bool, optional): If True, includes hidden files and directories.
        no_gitignore (bool, optional): If True, skips loading rules from `.gitignore` and `.agentpackignore`.
        no_default_patterns (bool, optional): If True, skips the built-in ignore lists (`.git`, `node_modules`, etc.).
        include_patterns (List[str], optional): If provided, only files matching these gitignore-style patterns are returned.
        exclude_patterns (List[str], optional): If provided, files matching these gitignore-style patterns are skipped.

    Returns:
        List[Path]: Paths of the files that should be packed, each built from
        `directory` (so absolute or relative exactly as `directory` was). The
        order is deterministic: directories are visited top-down in sorted
        order and files are sorted by name within each directory.
    """
    paths = []
    dir_path = Path(directory)

    ignore_spec = load_ignore_spec(dir_path, no_gitignore, no_default_patterns)
    include_spec = pathspec.PathSpec.from_lines('gitignore', include_patterns) if include_patterns else None
    exclude_spec = pathspec.PathSpec.from_lines('gitignore', exclude_patterns) if exclude_patterns else None

    for root, dirs, files in os.walk(dir_path):
        root_path = Path(root)
        rel_root = root_path.relative_to(dir_path)

        valid_dirs = []
        for d in dirs:
            if not include_hidden and d.startswith("."):
                continue

            # Use posix path with trailing slash for directory matching.
            # Joining onto Path(".") yields "name", never "./name", so no
            # prefix stripping is needed for top-level directories.
            rel_d = (rel_root / d).as_posix() + "/"

            if ignore_spec.match_file(rel_d):
                continue

            valid_dirs.append(d)

        # Assigning in place prunes the walk; sorting makes the traversal (and
        # therefore any ids derived from result order) independent of the
        # filesystem's listing order.
        dirs[:] = sorted(valid_dirs)

        for file in sorted(files):
            if not include_hidden and file.startswith("."):
                continue

            p = root_path / file
            if p.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue

            rel_file = p.relative_to(dir_path).as_posix()

            if ignore_spec.match_file(rel_file):
                continue

            if exclude_spec and exclude_spec.match_file(rel_file):
                continue

            if include_spec and not include_spec.match_file(rel_file):
                continue

            paths.append(p)

    return paths