Skip to content

Core API Reference

Pack

write_pack(input_dir, output_dir, include_patterns=None, exclude_patterns=None, no_gitignore=False, no_default_patterns=False, include_hidden=False, verbose=False, quiet=False, remove_empty_lines=False, fast_pdf=False)

Scans an input directory, parses supported files, chunks them, and generates a context pack.

This acts as the main orchestration engine for AgentPack, coordinating the Scanner, Parsers, and Chunker to produce a finalized manifest.yml and structured context directory.

Parameters:

Name Type Description Default
input_dir str

The root directory containing raw documents to scan.

required
output_dir str

The destination directory where the pack will be written.

required
include_patterns List[str]

Gitignore-style patterns; when provided, only files matching them are included.

None
exclude_patterns List[str]

Gitignore-style patterns; files matching them are excluded.

None
no_gitignore bool

If True, does not load ignore rules from .gitignore and .agentpackignore files.

False
no_default_patterns bool

If True, disables built-in ignore rules (e.g. .git/).

False
include_hidden bool

If True, includes hidden files and directories.

False
verbose bool

If True, enables detailed progress logging.

False
quiet bool

If True, suppresses all non-error output.

False
remove_empty_lines bool

If True, strips empty lines from parsed text/markdown blocks to save tokens.

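False
fast_pdf bool

Passed through to parser selection and to the per-file parse step; see the PDF parser for its exact effect.

False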
Source code in src/agentpack/pack.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def write_pack(
    input_dir: str,
    output_dir: str,
    include_patterns: List[str] = None,
    exclude_patterns: List[str] = None,
    no_gitignore: bool = False,
    no_default_patterns: bool = False,
    include_hidden: bool = False,
    verbose: bool = False,
    quiet: bool = False,
    remove_empty_lines: bool = False,
    fast_pdf: bool = False
):
    """
    Scans an input directory, parses supported files, chunks them, and generates a context pack.

    This acts as the main orchestration engine for AgentPack, coordinating the Scanner, Parsers,
    and Chunker to produce a finalized `manifest.yml` and structured context directory.

    The output directory is created if needed, together with `chunks/`, `tables/` and
    `reports/` sub-directories. Table blocks are written to `tables/<block_id>.md`, every
    chunk is written to its own `chunk.path`, and a short `reports/pack_report.md` is emitted.

    Args:
        input_dir (str): The root directory containing raw documents to scan.
        output_dir (str): The destination directory where the pack will be written.
        include_patterns (List[str], optional): Gitignore-style patterns of files to exclusively include.
        exclude_patterns (List[str], optional): Gitignore-style patterns of files to exclude.
        no_gitignore (bool, optional): If True, ignores `.gitignore` and `.agentpackignore` files.
        no_default_patterns (bool, optional): If True, disables built-in ignore rules (e.g. `.git/`).
        include_hidden (bool, optional): If True, includes hidden files and directories.
        verbose (bool, optional): If True, enables detailed progress logging.
        quiet (bool, optional): If True, suppresses all console output produced here
            (progress lines, failure warnings and the final completion message).
        remove_empty_lines (bool, optional): If True, strips empty lines from parsed text/markdown blocks to save tokens.
        fast_pdf (bool, optional): Forwarded unchanged to `get_parser` and `_parse_one`.
            NOTE(review): presumably selects a faster PDF parsing path -- confirm against the parser.

    Raises:
        Exception: Anything raised by a parse worker propagates through `future.result()`.
    """
    in_path = Path(input_dir)
    out_path = Path(output_dir)
    # Progress lines are shown only when asked for and not silenced.
    show_progress = verbose and not quiet

    out_path.mkdir(parents=True, exist_ok=True)
    for subdir in ("chunks", "tables", "reports"):
        (out_path / subdir).mkdir(exist_ok=True)

    files = scan_directory(
        input_dir,
        include_hidden=include_hidden,
        no_gitignore=no_gitignore,
        no_default_patterns=no_default_patterns,
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns
    )

    if show_progress:
        print(f"Found {len(files)} files to pack.")

    sources = []
    all_chunks = []
    all_tables = []
    cache_dir = out_path / ".cache"

    # Dispatch parallel parses; results keyed by index to preserve manifest order.
    # The index comes from the unfiltered scan result, so source ids may have gaps
    # where a file has no parser.
    indexed_files = [
        (i, fp) for i, fp in enumerate(files)
        if get_parser(fp.suffix, fast_pdf=fast_pdf) is not None
    ]
    docs_by_index: dict = {}

    # ThreadPoolExecutor rejects max_workers=0, hence the fallback to 1 for an empty scan.
    max_workers = min(4, len(indexed_files)) if indexed_files else 1
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(
                _parse_one, fp, f"src_{i:03d}", fast_pdf, remove_empty_lines, cache_dir
            ): i
            for i, fp in indexed_files
        }
        for future in as_completed(futures):
            idx = futures[future]
            doc = future.result()
            if doc is not None:
                docs_by_index[idx] = doc

    # Walk the files in scan order (not completion order) so the manifest is stable.
    for i, file_path in indexed_files:
        doc = docs_by_index.get(i)
        if doc is None:
            continue
        source_id = f"src_{i:03d}"

        if show_progress:
            print(f"Parsed: {file_path}")

        for block in doc.blocks:
            if block.type == "table":
                table_path = out_path / "tables" / f"{block.block_id}.md"
                with open(table_path, "w", encoding="utf-8") as f:
                    # DocumentBlock.text is Optional; write an empty file rather
                    # than crash the whole pack on a table without text.
                    f.write(block.text or "")
                all_tables.append({
                    "block_id": block.block_id,
                    "source_id": source_id,
                    "page": block.page,
                    "path": f"tables/{block.block_id}.md",
                })

        doc_chunks = chunk_document(doc)
        all_chunks.extend(doc_chunks)

        # A source counts as failed when the parser reported a parse error or
        # when chunking produced nothing usable.
        has_parse_error = any(w.type == "parse_error" for w in doc.warnings)
        if has_parse_error or len(doc_chunks) == 0:
            status = "failed"
            reason = next((w.message for w in doc.warnings if w.type == "parse_error"), "produced 0 chunks")
            if not quiet:
                print(f"  WARNING: {file_path.name} failed to index ({reason})")
        else:
            status = "success"

        sources.append({
            "id": source_id,
            "path": file_path.name,
            "type": doc.type,
            "checksum": doc.checksum,
            "status": status,
            "warnings": [w.dict() for w in doc.warnings]
        })

    # Write chunks to disk
    chunks_meta = []
    for chunk in all_chunks:
        chunk_file_path = out_path / chunk.path
        with open(chunk_file_path, "w", encoding="utf-8") as f:
            f.write(chunk.content)

        chunks_meta.append({
            "id": chunk.chunk_id,
            "source_id": chunk.source_id,
            "path": chunk.path,
            "token_count": chunk.token_count,
            "citation": chunk.metadata
        })

    manifest = {
        "pack": {
            "name": in_path.name,
            "version": _get_pack_version(),
            "generated_at": datetime.now(timezone.utc).isoformat()
        },
        "sources": sources,
        "chunks": chunks_meta,
        "tables": all_tables,
        "agent": {
            "instructions": [
                "Use citations when answering.",
                "Prefer raw chunks over summaries.",
                "Say not found when the corpus does not contain the answer."
            ]
        }
    }

    with open(out_path / "manifest.yml", "w", encoding="utf-8") as f:
        # sort_keys=False keeps the section order defined above.
        yaml.dump(manifest, f, default_flow_style=False, sort_keys=False)

    # Write a simple pack report
    with open(out_path / "reports" / "pack_report.md", "w", encoding="utf-8") as f:
        f.write(f"# Pack Report\n\nGenerated from {input_dir}\n")
        f.write(f"- Sources: {len(sources)}\n")
        f.write(f"- Chunks: {len(all_chunks)}\n")

    # The completion message is non-error output, so it must honour `quiet` too.
    if not quiet:
        print(f"Pack generated at {out_path}")

Retrieve

Chunker

Models

DocumentBlock

Bases: BaseModel

Represents a unified contiguous block of semantic information from a document.

Source code in src/agentpack/models.py
12
13
14
15
16
17
18
19
20
class DocumentBlock(BaseModel):
    """Represents a unified contiguous block of semantic information from a document.

    Instances are the elements of `SourceDocument.blocks`.
    """
    # Identifier of the block; write_pack uses it to name table files (tables/<block_id>.md).
    block_id: str
    # Identifier of the source this block is associated with.
    source_id: str
    # Kind of content held by the block; limited to the four literals listed.
    # write_pack exports blocks whose type is "table" as separate files.
    type: Literal["heading", "paragraph", "table", "page"]
    # Text content of the block; optional, None when not set.
    text: Optional[str] = None
    # Page associated with the block, if any.
    # NOTE(review): whether pages are 0- or 1-based is not visible here -- confirm.
    page: Optional[int] = None
    # presumably the chain of section headings enclosing the block -- TODO confirm.
    # NOTE(review): mutable default; relies on pydantic copying defaults per instance.
    section_path: List[str] = []
    # presumably a (first, last) row span for table-derived blocks -- TODO confirm.
    row_range: Optional[Tuple[int, int]] = None

ExtractionWarning

Bases: BaseModel

Represents a warning encountered during parsing (e.g. unreadable PDF page).

Source code in src/agentpack/models.py
4
5
6
7
8
9
class ExtractionWarning(BaseModel):
    """Represents a warning encountered during parsing (e.g. unreadable PDF page).

    Warnings are carried on `SourceDocument.warnings` and serialized into the
    manifest's per-source "warnings" list by write_pack.
    """
    # Identifier of the source the warning is associated with.
    source_id: str
    # Page the warning refers to; None when no page is recorded.
    page: Optional[int] = None
    # Warning category as a free-form string. write_pack marks a source as
    # "failed" when any of its warnings has type "parse_error".
    type: str
    # Human-readable description; echoed in write_pack's failure warning and
    # in the audit report.
    message: str

SourceDocument

Bases: BaseModel

Represents a fully parsed document normalized into the Canonical Document Model.

Source code in src/agentpack/models.py
23
24
25
26
27
28
29
30
class SourceDocument(BaseModel):
    """Represents a fully parsed document normalized into the Canonical Document Model.

    write_pack reads `type`, `checksum`, `blocks` and `warnings` from instances
    of this model when building the manifest.
    """
    # Identifier of the source.
    # NOTE(review): presumably the "src_NNN" id that write_pack hands to the parser -- confirm.
    source_id: str
    # Path of the document as a string.
    # NOTE(review): absolute vs. relative is not visible here -- confirm.
    path: str
    # Document format; limited to the four literals listed. Copied to the manifest "type" field.
    type: Literal["pdf", "markdown", "txt", "csv"]
    # Checksum of the document; copied to the manifest "checksum" field.
    # NOTE(review): hash algorithm is not visible here -- confirm.
    checksum: str
    # Parsed content blocks of the document.
    blocks: List[DocumentBlock]
    # Warnings collected while parsing; may include "parse_error" entries.
    warnings: List[ExtractionWarning]

Audit & Validate

audit_pack(pack_dir)

Generates an audit report for an agentpack output directory.

Source code in src/agentpack/audit.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def audit_pack(pack_dir: str) -> str:
    """Generates an audit report for an agentpack output directory.

    Reads `manifest.yml` from `pack_dir`, summarises the number of sources,
    chunks, tables and tokens, lists every extraction warning recorded on the
    sources, writes the result to `reports/validation_report.md` inside the
    pack, and returns the same text.

    Args:
        pack_dir (str): Directory of a previously generated pack.

    Returns:
        str: The Markdown report, or a message starting with "Error:" when the
        manifest is missing, unreadable, empty, or not a mapping. No report
        file is written in the error cases.
    """
    base_path = Path(pack_dir)
    manifest_path = base_path / "manifest.yml"

    if not manifest_path.exists():
        return f"Error: Manifest not found at {manifest_path}"

    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            manifest = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported as text, not raised.
        return f"Error: Failed to parse manifest YAML: {e}"

    # safe_load returns None for an empty file and may return a list or scalar;
    # both would break the .get() calls below.
    if not manifest:
        return "Error: Manifest is empty."
    if not isinstance(manifest, dict):
        return "Error: Manifest top level must be a mapping."

    # `or` also covers keys that are present but null in the YAML.
    pack_info = manifest.get("pack") or {}
    sources = manifest.get("sources") or []
    chunks = manifest.get("chunks") or []
    tables = manifest.get("tables") or []

    files_processed = len(sources)
    total_chunks = len(chunks)
    total_tables = len(tables)

    total_tokens = 0
    max_chunk_size = 0
    largest_chunk_id = None
    for chunk in chunks:
        token_count = chunk.get("token_count") or 0
        total_tokens += token_count
        # Strict comparison keeps the first of several equally large chunks and
        # leaves the id as None when no chunk has a positive token count.
        if token_count > max_chunk_size:
            max_chunk_size = token_count
            largest_chunk_id = chunk.get("id")

    warnings = [
        f"Source {source.get('id')}: [{warning.get('type')}] {warning.get('message')}"
        for source in sources
        for warning in (source.get("warnings") or [])
    ]

    # Format report
    report = [
        f"# AgentPack Audit Report for '{pack_info.get('name', 'Unknown')}'",
        f"Generated at: {pack_info.get('generated_at', 'Unknown')}\n",
        "## Statistics",
        f"- **Files Processed:** {files_processed}",
        f"- **Total Chunks:** {total_chunks}",
        f"- **Total Tables:** {total_tables}",
        f"- **Total Tokens:** {total_tokens}",
        f"- **Largest Chunk:** {max_chunk_size} tokens (ID: {largest_chunk_id})\n",
        "## Extraction Warnings",
    ]

    if warnings:
        report.extend(f"- {warning}" for warning in warnings)
    else:
        report.append("- No extraction warnings.")

    # Write report
    report_text = "\n".join(report)
    report_path = base_path / "reports" / "validation_report.md"
    report_path.parent.mkdir(exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    return report_text

validate_pack(pack_dir)

Validates the integrity of an agentpack output directory.

Source code in src/agentpack/validate.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def validate_pack(pack_dir: str) -> List[str]:
    """Validates the integrity of an agentpack output directory.

    Checks that `manifest.yml` exists, parses, and has the top-level keys
    `pack`, `sources`, `chunks` and `tables`; that every chunk and table
    refers to a known source id; that the chunk and table files listed in the
    manifest exist on disk; and that no chunk exceeds 4000 tokens.

    Args:
        pack_dir (str): Directory of a previously generated pack.

    Returns:
        List[str]: Human-readable error messages; an empty list means the pack
        passed every check. Manifest-level problems (missing, unparseable,
        empty, wrong shape, missing keys) are returned immediately without
        running the per-entry checks.
    """
    errors = []
    base_path = Path(pack_dir)
    manifest_path = base_path / "manifest.yml"

    if not manifest_path.exists():
        return [f"Manifest not found at {manifest_path}"]

    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            manifest = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: any read/parse failure becomes a validation error.
        return [f"Failed to parse manifest YAML: {e}"]

    if not manifest:
        return ["Manifest is empty."]
    # A list or scalar at the top level would break the .get() calls below.
    if not isinstance(manifest, dict):
        return ["Manifest top level must be a mapping."]

    # Check basic schema
    for key in ["pack", "sources", "chunks", "tables"]:
        if key not in manifest:
            errors.append(f"Manifest missing top-level key: '{key}'")

    if errors:
        return errors

    # `or []` tolerates sections that are present but null in the YAML.
    source_ids = {s.get("id") for s in (manifest.get("sources") or []) if s.get("id")}

    # Validate chunks
    for i, chunk in enumerate(manifest.get("chunks") or []):
        chunk_id = chunk.get("id", f"unknown_index_{i}")
        source_id = chunk.get("source_id")

        if source_id not in source_ids:
            errors.append(f"Chunk '{chunk_id}' refers to unknown source_id '{source_id}'")

        chunk_path = chunk.get("path")
        if not chunk_path:
            errors.append(f"Chunk '{chunk_id}' missing path attribute")
        else:
            full_path = base_path / chunk_path
            if not full_path.exists():
                errors.append(f"Chunk file missing: {full_path}")

        # Token validation (MVP arbitrary safe limit check)
        if chunk.get("token_count", 0) > 4000:
            errors.append(f"Chunk '{chunk_id}' exceeds safe token limit: {chunk.get('token_count')}")

    # Validate tables
    for i, table in enumerate(manifest.get("tables") or []):
        # write_pack stores the table identifier under "block_id"; "id" is
        # still accepted so manifests using that key keep working.
        table_id = table.get("block_id") or table.get("id", f"unknown_index_{i}")
        source_id = table.get("source_id")

        if source_id not in source_ids:
            errors.append(f"Table '{table_id}' refers to unknown source_id '{source_id}'")

        table_path = table.get("path")
        if table_path:
            full_path = base_path / table_path
            if not full_path.exists():
                errors.append(f"Table file missing: {full_path}")

    return errors

Scanner

scan_directory(directory, include_hidden=False, no_gitignore=False, no_default_patterns=False, include_patterns=None, exclude_patterns=None)

Recursively scans a directory and returns a list of supported files, respecting ignore rules.

Parameters:

Name Type Description Default
directory str

The root directory to scan.

required
include_hidden bool

If True, includes hidden files and directories.

False
no_gitignore bool

If True, skips loading rules from .gitignore and .agentpackignore.

False
no_default_patterns bool

If True, skips the built-in ignore lists (.git, node_modules, etc.).

False
include_patterns List[str]

If provided, only files matching these gitignore-style patterns are returned.

None
exclude_patterns List[str]

If provided, files matching these gitignore-style patterns are skipped.

None

Returns:

Type Description
List[Path]

The files that should be packed, as Path objects built from the directory argument (so they are relative when directory is relative and absolute when it is absolute).

Source code in src/agentpack/scanner.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def scan_directory(
    directory: str,
    include_hidden: bool = False,
    no_gitignore: bool = False,
    no_default_patterns: bool = False,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None
) -> List[Path]:
    """
    Recursively scans a directory and returns a list of supported files, respecting ignore rules.

    Directories and files are visited in sorted name order so that the result,
    and anything derived from a file's position in it, is reproducible across
    runs and filesystems.

    Args:
        directory (str): The root directory to scan.
        include_hidden (bool, optional): If True, includes hidden files and directories.
        no_gitignore (bool, optional): If True, skips loading rules from `.gitignore` and `.agentpackignore`.
        no_default_patterns (bool, optional): If True, skips the built-in ignore lists (`.git`, `node_modules`, etc.).
        include_patterns (List[str], optional): If provided, only files matching these gitignore-style patterns are returned.
        exclude_patterns (List[str], optional): If provided, files matching these gitignore-style patterns are skipped.

    Returns:
        List[Path]: Paths of the files that should be packed, each built from
        `directory` (relative when `directory` is relative, absolute when it is absolute).
    """
    paths = []
    dir_path = Path(directory)

    ignore_spec = load_ignore_spec(dir_path, no_gitignore, no_default_patterns)
    include_spec = pathspec.PathSpec.from_lines('gitignore', include_patterns) if include_patterns else None
    exclude_spec = pathspec.PathSpec.from_lines('gitignore', exclude_patterns) if exclude_patterns else None

    for root, dirs, files in os.walk(dir_path):
        root_path = Path(root)
        rel_root = root_path.relative_to(dir_path)

        valid_dirs = []
        # os.walk yields names in arbitrary order; sort for a stable traversal.
        for d in sorted(dirs):
            if not include_hidden and d.startswith("."):
                continue

            # Use posix path with trailing slash for directory matching.
            # Path(".") / d renders as plain "d", so no "./" prefix needs stripping.
            rel_d = (rel_root / d).as_posix() + "/"

            if ignore_spec.match_file(rel_d):
                continue

            valid_dirs.append(d)

        # In-place assignment prunes the walk: os.walk only descends into what remains.
        dirs[:] = valid_dirs

        for file in sorted(files):
            if not include_hidden and file.startswith("."):
                continue

            p = root_path / file
            if p.suffix.lower() not in SUPPORTED_EXTENSIONS:
                continue

            rel_file = p.relative_to(dir_path).as_posix()

            if ignore_spec.match_file(rel_file):
                continue

            # Explicit excludes win over includes.
            if exclude_spec and exclude_spec.match_file(rel_file):
                continue

            if include_spec and not include_spec.match_file(rel_file):
                continue

            paths.append(p)

    return paths