Files
microdao-daarion/site/agents/parser/index.html

1138 lines
46 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="canonical" href="https://IvanTytar.github.io/microdao-daarion/agents/parser/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.5.3, mkdocs-material-9.5.18">
<title>PARSER Agent (dots.ocr) - DAARION Documentation</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.66ac8b77.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>
  // Storage helpers used by the palette script further down the page.
  // The names are assigned without a declaration, so they end up as
  // properties of the global object.

  // URL two directory levels above the current page.
  __md_scope = new URL("../..", location);

  // Reduce a string to a number, one character at a time:
  // next = (acc << 5) - acc + charCode, starting from 0.
  __md_hash = (text) =>
    [...text].reduce((acc, ch) => (acc << 5) - acc + ch.charCodeAt(0), 0);

  // Read and JSON-decode the entry stored under "<scope pathname>.<key>".
  __md_get = (key, storage = localStorage, scope = __md_scope) => {
    const raw = storage.getItem(scope.pathname + "." + key);
    return JSON.parse(raw);
  };

  // JSON-encode `value` and store it under "<scope pathname>.<key>".
  // Any exception raised while encoding or writing is swallowed.
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {}
  };
</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#parser-agent-dotsocr" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="DAARION Documentation" class="md-header__button md-logo" aria-label="DAARION Documentation" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
DAARION Documentation
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
PARSER Agent (dots.ocr)
</span>
</div>
</div>
</div>
<script>
  // Apply the palette stored under "__palette" (if any) to <body>.
  // media/input/key/value/palette stay top-level `var` declarations, exactly
  // as in the minified form, so they remain global.
  var media, input, key, value, palette = __md_get("__palette");

  if (palette && palette.color) {
    if (palette.color.media === "(prefers-color-scheme)") {
      // Stored value is the generic system-preference marker: pick the
      // palette input matching the current light/dark preference and copy
      // its data-md-color-* attributes into the stored palette.
      media = matchMedia("(prefers-color-scheme: light)");
      input = document.querySelector(
        media.matches
          ? "[data-md-color-media='(prefers-color-scheme: light)']"
          : "[data-md-color-media='(prefers-color-scheme: dark)']"
      );
      for (const field of ["media", "scheme", "primary", "accent"]) {
        palette.color[field] = input.getAttribute("data-md-color-" + field);
      }
    }

    // Mirror every palette entry onto <body> as a data-md-color-<key> attribute.
    for ([key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.516 6.516 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5Z"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11h12Z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12 19 6.41Z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="DAARION Documentation" class="md-nav__button md-logo" aria-label="DAARION Documentation" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54Z"/></svg>
</a>
DAARION Documentation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../public/" class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../public/getting-started/" class="md-nav__link">
<span class="md-ellipsis">
Getting Started
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../public/architecture-overview/" class="md-nav__link">
<span class="md-ellipsis">
Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../public/daiS_daos_overview/" class="md-nav__link">
<span class="md-ellipsis">
DAIS & DAOS
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--section md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<!-- Label for the "Internal" section toggle. No tabindex is set: an empty tabindex value is not a valid integer and browsers ignore it. -->
<label class="md-nav__link" for="__nav_5" id="__nav_5_label">
<span class="md-ellipsis">
Internal
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Internal
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_1" >
<label class="md-nav__link" for="__nav_5_1" id="__nav_5_1_label" tabindex="0">
<span class="md-ellipsis">
Infra
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5_1">
<span class="md-nav__icon md-icon"></span>
Infra
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../internal/infra/INFRA_AUTOMATION_PACK_V1/" class="md-nav__link">
<span class="md-ellipsis">
Infra Automation Pack v1
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../internal/infra/monitoring_overview/" class="md-nav__link">
<span class="md-ellipsis">
Monitoring Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../internal/infra/nodes_registry_v0/" class="md-nav__link">
<span class="md-ellipsis">
Nodes Registry v0
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5_2" >
<label class="md-nav__link" for="__nav_5_2" id="__nav_5_2_label" tabindex="0">
<span class="md-ellipsis">
Specs
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_5_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5_2">
<span class="md-nav__icon md-icon"></span>
Specs
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../internal/specs/matrix_presence_aggregator/" class="md-nav__link">
<span class="md-ellipsis">
Matrix Presence Aggregator
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../internal/specs/city_map_spec/" class="md-nav__link">
<span class="md-ellipsis">
City Map Spec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../internal/specs/node_join_protocol_draft/" class="md-nav__link">
<span class="md-ellipsis">
Node Join Protocol (Draft)
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#_1" class="md-nav__link">
<span class="md-ellipsis">
Роль та призначення
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
Технічна база
</span>
</a>
<nav class="md-nav" aria-label="Технічна база">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#rednote-hilabdotsocr" class="md-nav__link">
<span class="md-ellipsis">
Модель: rednote-hilab/dots.ocr
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
Ключові можливості моделі
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
Вхідні дані
</span>
</a>
<nav class="md-nav" aria-label="Вхідні дані">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
Підтримувані формати
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
Режими виводу
</span>
</a>
<nav class="md-nav" aria-label="Режими виводу">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#1-raw_json" class="md-nav__link">
<span class="md-ellipsis">
1. raw_json
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#2-markdown" class="md-nav__link">
<span class="md-ellipsis">
2. markdown
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#3-qa_pairs" class="md-nav__link">
<span class="md-ellipsis">
3. qa_pairs
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#4-chunks" class="md-nav__link">
<span class="md-ellipsis">
4. chunks
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_7" class="md-nav__link">
<span class="md-ellipsis">
Вихідні дані
</span>
</a>
<nav class="md-nav" aria-label="Вихідні дані">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parseddocument" class="md-nav__link">
<span class="md-ellipsis">
Структура ParsedDocument
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_8" class="md-nav__link">
<span class="md-ellipsis">
Обмеження
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_9" class="md-nav__link">
<span class="md-ellipsis">
Інтеграція з системою
</span>
</a>
<nav class="md-nav" aria-label="Інтеграція з системою">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#1-dagi-router" class="md-nav__link">
<span class="md-ellipsis">
1. DAGI Router
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#2-crewai-orchestrator" class="md-nav__link">
<span class="md-ellipsis">
2. CrewAI Orchestrator
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#3-rbac-integration" class="md-nav__link">
<span class="md-ellipsis">
3. RBAC Integration
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_10" class="md-nav__link">
<span class="md-ellipsis">
Використання
</span>
</a>
<nav class="md-nav" aria-label="Використання">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#parser" class="md-nav__link">
<span class="md-ellipsis">
Приклад запиту до PARSER
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#dagi-router" class="md-nav__link">
<span class="md-ellipsis">
Приклад через DAGI Router
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_11" class="md-nav__link">
<span class="md-ellipsis">
Архітектура сервісу
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_12" class="md-nav__link">
<span class="md-ellipsis">
Залежності
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_13" class="md-nav__link">
<span class="md-ellipsis">
Посилання
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="parser-agent-dotsocr">PARSER Agent (dots.ocr)<a class="headerlink" href="#parser-agent-dotsocr" title="Permanent link">&para;</a></h1>
<p><strong>Document Ingestion &amp; Structuring Agent</strong> для DAARION / microDAO / SecondMe.</p>
<h2 id="_1">Роль та призначення<a class="headerlink" href="#_1" title="Permanent link">&para;</a></h2>
<p>PARSER — це агент, який перетворює неструктуровані документи (PDF, зображення) у структуровані дані для RAG (Retrieval-Augmented Generation) та знань-орієнтованих систем.</p>
<p><strong>Основна мета:</strong> Забезпечити якісний інжест документів у базу знань зі збереженням структури, layout та семантики.</p>
<h2 id="_2">Технічна база<a class="headerlink" href="#_2" title="Permanent link">&para;</a></h2>
<h3 id="rednote-hilabdotsocr">Модель: <code>rednote-hilab/dots.ocr</code><a class="headerlink" href="#rednote-hilabdotsocr" title="Permanent link">&para;</a></h3>
<ul>
<li><strong>Тип:</strong> Image-Text-to-Text VLM (Vision Language Model)</li>
<li><strong>Орієнтація:</strong> Документ-орієнтований OCR з layout detection</li>
<li><strong>GitHub:</strong> <a href="https://github.com/rednote-hilab/dots.ocr">https://github.com/rednote-hilab/dots.ocr</a></li>
</ul>
<h3 id="_3">Ключові можливості моделі<a class="headerlink" href="#_3" title="Permanent link">&para;</a></h3>
<!-- Four numbered capabilities, each with a nested bullet list of details. -->
<ol>
<li><strong>Мультимовний OCR + Layout</strong>
<ul>
<li>Розпізнає текст на багатьох мовах (включаючи low-resource)</li>
<li>Правильно відновлює <strong>reading order</strong> (колонки, блоки, змішаний макет)</li>
<li>Підтримка складних макетів (наукові статті, звіти, форми)</li>
</ul>
</li>
<li><strong>Єдиний VLM для всього</strong>
<ul>
<li>Один модельний стек для <strong>layout detection + OCR</strong></li>
<li>Не потребує окремих моделей для таблиць/тексту/формул</li>
<li>Уніфікований підхід до різних типів контенту</li>
</ul>
</li>
<li><strong>Структурований вихід</strong>
<ul>
<li>JSON з блоками (<code>paragraph</code>, <code>heading</code>, <code>table</code>, <code>formula</code>, <code>figure_caption</code>, ...)</li>
<li>Bbox-координати, сторінка, читальний порядок</li>
<li>Окремі структури для таблиць (рядки/колонки, merged cells)</li>
<li>Markdown/HTML-подібний текст (таблиці можна відтворювати як Markdown)</li>
</ul>
</li>
<li><strong>Орієнтація на документи</strong>
<ul>
<li>Підтримка форм, інвойсів, звітів, наукових статей, презентацій</li>
<li>Добре працює із змішаним контентом (текст навколо формул, підписи до рисунків)</li>
</ul>
</li>
</ol>
<h2 id="_4">Вхідні дані<a class="headerlink" href="#_4" title="Permanent link">&para;</a></h2>
<h3 id="_5">Підтримувані формати<a class="headerlink" href="#_5" title="Permanent link">&para;</a></h3>
<!-- Three input categories, each with a nested bullet list of details. -->
<ul>
<li><strong>PDF:</strong>
<ul>
<li>Скани (зображення сторінок)</li>
<li>"Цифрові" PDF (текст + векторна графіка)</li>
<li>Багатосторінкові документи</li>
</ul>
</li>
<li><strong>Зображення:</strong>
<ul>
<li>PNG, JPEG, TIFF</li>
<li>Підтримка різних роздільних здатностей</li>
</ul>
</li>
<li><strong>Документи зі змішаним контентом:</strong>
<ul>
<li>Текст + таблиці + схеми + формули</li>
<li>Наукові статті, звіти, презентації</li>
</ul>
</li>
</ul>
<h2 id="_6">Режими виводу<a class="headerlink" href="#_6" title="Permanent link">&para;</a></h2>
<p>PARSER підтримує кілька режимів виводу (конфігурується через промпт/параметри):</p>
<h3 id="1-raw_json">1. <code>raw_json</code><a class="headerlink" href="#1-raw_json" title="Permanent link">&para;</a></h3>
<p>Повний структурований JSON з усіма блоками:</p>
<!-- raw_json example; whitespace inside <pre> is significant, two spaces per nesting level. -->
<div class="codehilite"><pre><span></span><code><span class="p">{</span>
<span class="w">  </span><span class="nt">&quot;pages&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w">    </span><span class="p">{</span>
<span class="w">      </span><span class="nt">&quot;page_num&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;blocks&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w">        </span><span class="p">{</span>
<span class="w">          </span><span class="nt">&quot;type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;heading&quot;</span><span class="p">,</span>
<span class="w">          </span><span class="nt">&quot;text&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;Заголовок&quot;</span><span class="p">,</span>
<span class="w">          </span><span class="nt">&quot;bbox&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">x</span><span class="p">,</span><span class="w"> </span><span class="err">y</span><span class="p">,</span><span class="w"> </span><span class="err">wid</span><span class="kc">t</span><span class="err">h</span><span class="p">,</span><span class="w"> </span><span class="err">heigh</span><span class="kc">t</span><span class="p">],</span>
<span class="w">          </span><span class="nt">&quot;reading_order&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span>
<span class="w">        </span><span class="p">},</span>
<span class="w">        </span><span class="p">{</span>
<span class="w">          </span><span class="nt">&quot;type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;paragraph&quot;</span><span class="p">,</span>
<span class="w">          </span><span class="nt">&quot;text&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;Текст параграфу...&quot;</span><span class="p">,</span>
<span class="w">          </span><span class="nt">&quot;bbox&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">],</span>
<span class="w">          </span><span class="nt">&quot;reading_order&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">2</span>
<span class="w">        </span><span class="p">},</span>
<span class="w">        </span><span class="p">{</span>
<span class="w">          </span><span class="nt">&quot;type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;table&quot;</span><span class="p">,</span>
<span class="w">          </span><span class="nt">&quot;rows&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">],</span>
<span class="w">          </span><span class="nt">&quot;columns&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">],</span>
<span class="w">          </span><span class="nt">&quot;merged_cells&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">]</span>
<span class="w">        </span><span class="p">}</span>
<span class="w">      </span><span class="p">]</span>
<span class="w">    </span><span class="p">}</span>
<span class="w">  </span><span class="p">]</span>
<span class="p">}</span>
</code></pre></div>
<h3 id="2-markdown">2. <code>markdown</code><a class="headerlink" href="#2-markdown" title="Permanent link">&para;</a></h3>
<p>Таблиці/розділи у Markdown форматі:</p>
<!-- Markdown example; blank lines separate the heading, the paragraph and the table so each parses as its own block. -->
<div class="codehilite"><pre><span></span><code><span class="gh"># Заголовок</span>

Текст параграфу...

| Колонка 1 | Колонка 2 |
|-----------|-----------|
| Значення 1 | Значення 2 |
</code></pre></div>
<h3 id="3-qa_pairs">3. <code>qa_pairs</code><a class="headerlink" href="#3-qa_pairs" title="Permanent link">&para;</a></h3>
<p>Парсер одразу повертає Q&amp;A-пари по документу (через LLM-постпроцес):</p>
<!-- qa_pairs example; whitespace inside <pre> is significant, two spaces per nesting level. -->
<div class="codehilite"><pre><span></span><code><span class="p">{</span>
<span class="w">  </span><span class="nt">&quot;qa_pairs&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w">    </span><span class="p">{</span>
<span class="w">      </span><span class="nt">&quot;question&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;Що таке токеноміка microDAO?&quot;</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;answer&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;Токеноміка microDAO включає...&quot;</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;source_page&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;source_bbox&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">]</span>
<span class="w">    </span><span class="p">}</span>
<span class="w">  </span><span class="p">]</span>
<span class="p">}</span>
</code></pre></div>
<h3 id="4-chunks">4. <code>chunks</code><a class="headerlink" href="#4-chunks" title="Permanent link">&para;</a></h3>
<p>Масив семантичних фрагментів для RAG:</p>
<!-- chunks example; whitespace inside <pre> is significant, two spaces per nesting level. -->
<div class="codehilite"><pre><span></span><code><span class="p">{</span>
<span class="w">  </span><span class="nt">&quot;chunks&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w">    </span><span class="p">{</span>
<span class="w">      </span><span class="nt">&quot;text&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;Фрагмент тексту...&quot;</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;page&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;bbox&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="err">...</span><span class="p">],</span>
<span class="w">      </span><span class="nt">&quot;section&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;introduction&quot;</span><span class="p">,</span>
<span class="w">      </span><span class="nt">&quot;metadata&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w">        </span><span class="nt">&quot;dao_id&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;daarion&quot;</span><span class="p">,</span>
<span class="w">        </span><span class="nt">&quot;doc_id&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;tokenomics_v1&quot;</span>
<span class="w">      </span><span class="p">}</span>
<span class="w">    </span><span class="p">}</span>
<span class="w">  </span><span class="p">]</span>
<span class="p">}</span>
</code></pre></div>
<h2 id="_7">Вихідні дані<a class="headerlink" href="#_7" title="Permanent link">&para;</a></h2>
<h3 id="parseddocument">Структура <code>ParsedDocument</code><a class="headerlink" href="#parseddocument" title="Permanent link">&para;</a></h3>
<div class="codehilite"><pre><span></span><code><span class="kd">interface</span><span class="w"> </span><span class="nx">ParsedDocument</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nx">doc_id</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">doc_url?</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">doc_type</span><span class="o">:</span><span class="w"> </span><span class="s2">&quot;pdf&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;image&quot;</span><span class="p">;</span>
<span class="w"> </span><span class="nx">pages</span><span class="o">:</span><span class="w"> </span><span class="kt">ParsedPage</span><span class="p">[];</span>
<span class="w"> </span><span class="nx">metadata</span><span class="o">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nx">dao_id</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">user_id</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">uploaded_at</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">file_size</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="w"> </span><span class="nx">page_count</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="w"> </span><span class="p">};</span>
<span class="p">}</span>
<span class="kd">interface</span><span class="w"> </span><span class="nx">ParsedPage</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nx">page_num</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="w"> </span><span class="nx">blocks</span><span class="o">:</span><span class="w"> </span><span class="kt">ParsedBlock</span><span class="p">[];</span>
<span class="w"> </span><span class="nx">width</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="w"> </span><span class="nx">height</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="p">}</span>
<span class="kd">interface</span><span class="w"> </span><span class="nx">ParsedBlock</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kr">type</span><span class="o">:</span><span class="w"> </span><span class="s2">&quot;paragraph&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;heading&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;table&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;formula&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;figure_caption&quot;</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="s2">&quot;list&quot;</span><span class="p">;</span>
<span class="w"> </span><span class="nx">text</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">;</span>
<span class="w"> </span><span class="nx">bbox</span><span class="o">:</span><span class="w"> </span><span class="p">[</span><span class="nx">x</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">y</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">width</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">height</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">];</span>
<span class="w"> </span><span class="nx">reading_order</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">;</span>
<span class="w"> </span><span class="c1">// Для таблиць:</span>
<span class="w"> </span><span class="nx">table_data</span><span class="o">?:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nx">rows</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">[][];</span>
<span class="w"> </span><span class="nx">columns</span><span class="o">:</span><span class="w"> </span><span class="kt">string</span><span class="p">[];</span>
<span class="w"> </span><span class="nx">merged_cells?</span><span class="o">:</span><span class="w"> </span><span class="kt">Array</span><span class="o">&lt;</span><span class="p">{</span><span class="nx">row</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">col</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">rowspan</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">,</span><span class="w"> </span><span class="nx">colspan</span><span class="o">:</span><span class="w"> </span><span class="kt">number</span><span class="p">}</span><span class="o">&gt;</span><span class="p">;</span>
<span class="w"> </span><span class="p">};</span>
<span class="p">}</span>
</code></pre></div>
<h2 id="_8">Обмеження<a class="headerlink" href="#_8" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>Max pages:</strong> Конфігурується через <code>PARSER_MAX_PAGES</code> (за замовчуванням: 100)</li>
<li><strong>Max resolution:</strong> Конфігурується через <code>PARSER_MAX_RESOLUTION</code> (за замовчуванням: 4096x4096)</li>
<li><strong>Max file size:</strong> Залежить від runtime (рекомендовано: до 50MB для PDF)</li>
<li><strong>Підтримка мов:</strong> Залежить від моделі dots.ocr (українська підтримується)</li>
</ul>
<h2 id="_9">Інтеграція з системою<a class="headerlink" href="#_9" title="Permanent link">&para;</a></h2>
<h3 id="1-dagi-router">1. DAGI Router<a class="headerlink" href="#1-dagi-router" title="Permanent link">&para;</a></h3>
<p>PARSER інтегрується як окремий провайдер:</p>
<!-- YAML provider example; indentation is significant: type and base_url are children of parser. -->
<div class="codehilite"><pre><span></span><code><span class="nt">providers</span><span class="p">:</span>
<span class="w">  </span><span class="nt">parser</span><span class="p">:</span>
<span class="w">    </span><span class="nt">type</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">ocr</span>
<span class="w">    </span><span class="nt">base_url</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;http://parser-service:9400&quot;</span>
</code></pre></div>
<p><strong>Routing rule:</strong></p>
<div class="codehilite"><pre><span></span><code><span class="nt">routing</span><span class="p">:</span>
<span class="w"> </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="nt">id</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">doc_parse</span>
<span class="w"> </span><span class="nt">when</span><span class="p">:</span>
<span class="w"> </span><span class="nt">mode</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">doc_parse</span>
<span class="w"> </span><span class="nt">use_provider</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">parser</span>
</code></pre></div>
<h3 id="2-crewai-orchestrator">2. CrewAI Orchestrator<a class="headerlink" href="#2-crewai-orchestrator" title="Permanent link">&para;</a></h3>
<p>PARSER як агент у CrewAI workflow:</p>
<ul>
<li><strong><code>doc_ingest_workflow</code>:</strong> Перевірка типу документа → виклик PARSER → інжест у RAG</li>
<li><strong><code>rag_answer_workflow</code>:</strong> Використання розпарсених документів для відповідей</li>
</ul>
<h3 id="3-rbac-integration">3. RBAC Integration<a class="headerlink" href="#3-rbac-integration" title="Permanent link">&para;</a></h3>
<ul>
<li>Перевірка прав на інжест документів (<code>role: admin</code>, <code>role: researcher</code>)</li>
<li>Обмеження на приватні/публічні документи</li>
<li>Перевірка <code>dao_id</code> для ізоляції даних</li>
</ul>
<h2 id="_10">Використання<a class="headerlink" href="#_10" title="Permanent link">&para;</a></h2>
<h3 id="parser">Приклад запиту до PARSER<a class="headerlink" href="#parser" title="Permanent link">&para;</a></h3>
<div class="codehilite"><pre><span></span><code>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://parser-service:9400/ocr/parse<span class="w"> </span><span class="se">\</span>
<span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<span class="s1"> &quot;doc_url&quot;: &quot;https://example.com/tokenomics.pdf&quot;,</span>
<span class="s1"> &quot;output_mode&quot;: &quot;chunks&quot;,</span>
<span class="s1"> &quot;dao_id&quot;: &quot;daarion&quot;,</span>
<span class="s1"> &quot;user_id&quot;: &quot;user123&quot;</span>
<span class="s1"> }&#39;</span>
</code></pre></div>
<h3 id="dagi-router">Приклад через DAGI Router<a class="headerlink" href="#dagi-router" title="Permanent link">&para;</a></h3>
<div class="codehilite"><pre><span></span><code>curl<span class="w"> </span>-X<span class="w"> </span>POST<span class="w"> </span>http://router:9102/route<span class="w"> </span><span class="se">\</span>
<span class="w"> </span>-H<span class="w"> </span><span class="s2">&quot;Content-Type: application/json&quot;</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>-d<span class="w"> </span><span class="s1">&#39;{</span>
<span class="s1"> &quot;mode&quot;: &quot;doc_parse&quot;,</span>
<span class="s1"> &quot;dao_id&quot;: &quot;daarion&quot;,</span>
<span class="s1"> &quot;user_id&quot;: &quot;user123&quot;,</span>
<span class="s1"> &quot;payload&quot;: {</span>
<span class="s1"> &quot;doc_url&quot;: &quot;https://example.com/tokenomics.pdf&quot;,</span>
<span class="s1"> &quot;output_mode&quot;: &quot;qa_pairs&quot;</span>
<span class="s1"> }</span>
<span class="s1"> }&#39;</span>
</code></pre></div>
<h2 id="_11">Архітектура сервісу<a class="headerlink" href="#_11" title="Permanent link">&para;</a></h2>
<div class="codehilite"><pre><span></span><code><span class="n">parser</span><span class="o">-</span><span class="n">service</span><span class="o">/</span>
<span class="err">├──</span><span class="w"> </span><span class="n">main</span><span class="o">.</span><span class="n">py</span><span class="w"> </span><span class="c1"># FastAPI сервіс</span>
<span class="err">├──</span><span class="w"> </span><span class="n">parser_runtime</span><span class="o">/</span><span class="w"> </span><span class="c1"># Runtime для dots.ocr</span>
<span class="err">│</span><span class="w"> </span><span class="err">├──</span><span class="w"> </span><span class="n">__init__</span><span class="o">.</span><span class="n">py</span>
<span class="err">│</span><span class="w"> </span><span class="err">├──</span><span class="w"> </span><span class="n">model_loader</span><span class="o">.</span><span class="n">py</span><span class="w"> </span><span class="c1"># Lazy init, GPU/CPU fallback</span>
<span class="err">│</span><span class="w"> </span><span class="err">└──</span><span class="w"> </span><span class="n">inference</span><span class="o">.</span><span class="n">py</span><span class="w"> </span><span class="c1"># parse_image, parse_pdf</span>
<span class="err">├──</span><span class="w"> </span><span class="n">schemas</span><span class="o">.</span><span class="n">py</span><span class="w"> </span><span class="c1"># Pydantic моделі</span>
<span class="err">└──</span><span class="w"> </span><span class="n">config</span><span class="o">.</span><span class="n">py</span><span class="w"> </span><span class="c1"># Конфігурація</span>
</code></pre></div>
<h2 id="_12">Залежності<a class="headerlink" href="#_12" title="Permanent link">&para;</a></h2>
<ul>
<li><strong>Runtime:</strong> HuggingFace Transformers + vLLM/SGLang (або llama.cpp/GGUF)</li>
<li><strong>Модель:</strong> <code>rednote-hilab/dots.ocr</code></li>
<li><strong>Python:</strong> 3.11+</li>
<li><strong>GPU:</strong> Рекомендовано (можливий fallback на CPU)</li>
</ul>
<h2 id="_13">Посилання<a class="headerlink" href="#_13" title="Permanent link">&para;</a></h2>
<ul>
<li><a href="../TODO-PARSER-RAG.md">TODO: PARSER + RAG Implementation</a></li>
<li><a href="./dagi-router.md">DAGI Router Documentation</a></li>
<li><a href="./crewai-orchestrator.md">CrewAI Orchestrator</a></li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "../..", "features": ["navigation.sections", "navigation.instant", "content.code.copy"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script>
<script src="../../assets/javascripts/bundle.3220b9d7.min.js"></script>
</body>
</html>