<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">ellibs</journal-id><journal-title-group><journal-title xml:lang="ru">Электронные библиотеки</journal-title><trans-title-group xml:lang="en"><trans-title>Russian Digital Libraries Journal</trans-title></trans-title-group></journal-title-group><issn pub-type="epub">1562-5419</issn><publisher><publisher-name>Казанский (Приволжский) федеральный университет</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.26907/1562-5419-2025-28-5-1085-1102</article-id><article-id custom-type="elpub" pub-id-type="custom">ellibs-610</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>Статьи</subject></subj-group></article-categories><title-group><article-title>Нейросимволический подход к дополненной генерации текста на основе автоматизированной индукции морфотактических правил</article-title><trans-title-group xml:lang="en"><trans-title>Neuro-Symbolic Approach to Augmented Text Generation via Automated Induction of Morphotactic Rules</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Исангулов</surname><given-names>Марат Вильданович</given-names></name><name name-style="western" xml:lang="en"><surname>Isangulov</surname><given-names>Marat Vildanovich</given-names></name></name-alternatives><email xlink:type="simple">marathon.our@gmail.com</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" 
xml:lang="ru"><surname>Елизаров</surname><given-names>Александр Михайлович</given-names></name><name name-style="western" xml:lang="en"><surname>Elizarov</surname><given-names>Alexander Mikhailovich</given-names></name></name-alternatives><email xlink:type="simple">amelizarov@gmail.com</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Кунафин</surname><given-names>Айгиз Ражапович</given-names></name><name name-style="western" xml:lang="en"><surname>Kunafin</surname><given-names>Aygiz Razhapovich</given-names></name></name-alternatives><email xlink:type="simple">aigizk@gmail.com</email></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Гатиатуллин</surname><given-names>Айрат Рафизович</given-names></name><name name-style="western" xml:lang="en"><surname>Gatiatullin</surname><given-names>Airat Rafizovich</given-names></name></name-alternatives><email xlink:type="simple">ayrat.gatiatullin@gmail.com</email><xref ref-type="aff" rid="aff-3"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Прокопьев</surname><given-names>Николай Аркадиевич</given-names></name><name name-style="western" xml:lang="en"><surname>Prokopyev</surname><given-names>Nikolai Arkadievich</given-names></name></name-alternatives><email xlink:type="simple">nikolai.prokopyev@gmail.com</email><xref ref-type="aff" rid="aff-3"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru"><institution>Казанский (Приволжский) федеральный университет</institution></aff><aff xml:lang="en"><institution>Kazan (Volga region) Federal University</institution></aff></aff-alternatives><aff-alternatives id="aff-2"><aff xml:lang="ru"><institution>Академия наук Республики Татарстан</institution></aff><aff xml:lang="en"><institution>Academy of 
Sciences of the Republic of Tatarstan</institution></aff></aff-alternatives><aff-alternatives id="aff-3"><aff xml:lang="ru"><institution>Академия наук Республики Татарстан</institution></aff><aff xml:lang="en"><institution>Tatarstan Academy of Sciences</institution></aff></aff-alternatives><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>12</month><year>2025</year></pub-date><volume>28</volume><issue>5</issue><fpage>1085</fpage><lpage>1102</lpage><permissions><copyright-statement>Copyright © Исангулов М.В., Елизаров А.М., Кунафин А.Р., Гатиатуллин А.Р., Прокопьев Н.А., 2025</copyright-statement><copyright-year>2025</copyright-year><copyright-holder xml:lang="ru">Исангулов М.В., Елизаров А.М., Кунафин А.Р., Гатиатуллин А.Р., Прокопьев Н.А.</copyright-holder><copyright-holder xml:lang="en">Isangulov M.V., Elizarov A.M., Kunafin A.R., Gatiatullin A.R., Prokopyev N.A.</copyright-holder><license xml:lang="ru" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>Данная работа распространяется под лицензией Creative Commons Attribution 4.0.</license-p></license><license xml:lang="en" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://ellibs.elpub.ru/jour/article/view/610">https://ellibs.elpub.ru/jour/article/view/610</self-uri><abstract><p>Представлен гибридный нейросимволический метод, который объединяет большую языковую модель (LLM) и конечный автомат (FST) для обеспечения морфологической корректности при генерации текста на агглютинативных языках. 
Система автоматически извлекает правила из корпусных данных: для локальных примеров словоформ LLM формирует цепочки морфологического разбора, которые затем агрегируются и упорядочиваются в компактные описания правил морфотактики (LEXC) и выбора алломорфов (regex). На этапе генерации LLM и FST работают совместно: если токен не распознается автоматом, LLM извлекает из контекста пару «лемма + теги», а FST реализует корректную поверхностную форму. В качестве набора данных использован корпус художественной литературы (~1600 предложений). Для списка из 50 существительных извлечено 250 словоформ. По предложенному алгоритму LLM сгенерировала 110 контекстных regex-правил вместе с LEXC-морфотактикой, на основе чего был скомпилирован FST, распознавший 170/250 форм (~70%). В прикладном тесте машинного перевода на подкорпусе из 300 предложений интеграция данного FST в цикл LLM повысила качество с BLEU 16.14 / ChrF 45.13 до BLEU 25.71 / ChrF 50.87 без дообучения переводчика. Подход применим к иным частям речи и другим агглютинативным и малоресурсным языкам, где он может быть использован для наполнения словарных и грамматических ресурсов.
</p></abstract><trans-abstract xml:lang="en"><p>The work presents a hybrid neuro-symbolic method that combines a large language model (LLM) and a finite-state transducer (FST) to ensure morphological correctness in text generation for agglutinative languages. The system automatically extracts rules from corpus data: for local examples of word forms, the LLM produces sequences of morphological analyses, which are then aggregated and organized into compact descriptions of morphotactic rules (LEXC) and allomorph selection (regex). During generation, the LLM and FST operate jointly: if a token is not recognized by the automaton, the LLM derives a “lemma+tags” pair from the context, and the FST produces the correct surface form. A literary corpus (~1600 sentences) was used as the dataset. For a list of 50 nouns, 250 word forms were extracted. Using the proposed algorithm, the LLM generated 110 context-sensitive regex rules along with LEXC morphotactics, from which an FST was compiled that recognized 170/250 forms (~70%). In an applied machine translation test on a subcorpus of 300 sentences, integrating this FST into the LLM cycle improved quality from BLEU 16.14 / ChrF 45.13 to BLEU 25.71 / ChrF 50.87 without retraining the translator. The approach scales to other parts of speech (verbs, adjectives, etc.) as well as to other agglutinative and low-resource languages, where it can accelerate the development of lexical and grammatical resources.
</p></trans-abstract><kwd-group xml:lang="ru"><kwd>нейросимволический подход</kwd><kwd>большая языковая модель</kwd><kwd>конечные автоматы</kwd><kwd>двухуровневая морфология</kwd><kwd>LEXC морфотактика</kwd><kwd>машинный перевод</kwd><kwd>агглютинативные языки</kwd><kwd>башкирский язык</kwd></kwd-group><kwd-group xml:lang="en"><kwd>neuro-symbolic approach</kwd><kwd>large language model</kwd><kwd>finite-state transducers</kwd><kwd>two-level morphology</kwd><kwd>LEXC morphotactics</kwd><kwd>machine translation</kwd><kwd>agglutinative languages</kwd><kwd>Bashkir language</kwd></kwd-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Sproat R., Østling R. The morphological gap between translation quality and surface accuracy // Proceedings of the WMT 2020 Conference. Online, 2020. P. 1015–1024.</mixed-citation><mixed-citation xml:lang="en">Sproat R., Østling R. The morphological gap between translation quality and surface accuracy // Proceedings of the WMT 2020 Conference. Online, 2020. P. 1015–1024.</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Kann K., Cotterell R., Schütze H. Neural models of inflectional morphology // Proceedings of the 15th Conference of the European Chapter of the ACL (EACL 2017). Valencia, 2017. P. 322–334.</mixed-citation><mixed-citation xml:lang="en">Kann K., Cotterell R., Schütze H. Neural models of inflectional morphology // Proceedings of the 15th Conference of the European Chapter of the ACL (EACL 2017). Valencia, 2017. P. 322–334.</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Mielke S., Eisenstein J., Cotterell R. Dialect-to-dialect translation and cross-dialect morphological robustness of language models // Transactions of the ACL. 2021. Vol. 9. P. 
288–302.</mixed-citation><mixed-citation xml:lang="en">Mielke S., Eisenstein J., Cotterell R. Dialect-to-dialect translation and cross-dialect morphological robustness of language models // Transactions of the ACL. 2021. Vol. 9. P. 288–302.</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Koskenniemi K. Two-level morphology: a general computational model for word-form recognition and production. Helsinki: University of Helsinki, Department of General Linguistics, 1983. 38 p.</mixed-citation><mixed-citation xml:lang="en">Koskenniemi K. Two-level morphology: a general computational model for word-form recognition and production. Helsinki: University of Helsinki, Department of General Linguistics, 1983. 38 p.</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Beesley K.R., Karttunen L. Finite-State Morphology. Stanford (CA): CSLI Publications, 2003. 550 p.</mixed-citation><mixed-citation xml:lang="en">Beesley K.R., Karttunen L. Finite-State Morphology. Stanford (CA): CSLI Publications, 2003. 550 p.</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Stahlberg F., Hasler E., Waite A. SGNMT: A flexible NMT decoding toolkit for quick prototyping of new models // Proceedings of ACL System Demonstrations. Vancouver, 2017. P. 67–72.</mixed-citation><mixed-citation xml:lang="en">Stahlberg F., Hasler E., Waite A. SGNMT: A flexible NMT decoding toolkit for quick prototyping of new models // Proceedings of ACL System Demonstrations. Vancouver, 2017. P. 67–72.</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Hulden M. FST-based grammar correction for richly inflected languages // Proceedings of ACL Workshop on Finite-State Methods. Montréal, 2012. P. 
32–39.</mixed-citation><mixed-citation xml:lang="en">Hulden M. FST-based grammar correction for richly inflected languages // Proceedings of ACL Workshop on Finite-State Methods. Montréal, 2012. P. 32–39.</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Tamchyna A., Bojar O. Target-side context for morphological reinflection // Proceedings of the First Conference on Machine Translation (WMT 2016). Berlin, 2016. P. 586–594.</mixed-citation><mixed-citation xml:lang="en">Tamchyna A., Bojar O. Target-side context for morphological reinflection // Proceedings of the First Conference on Machine Translation (WMT 2016). Berlin, 2016. P. 586–594.</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Schwartz L., Liu S., Surrain S. Bootstrapping a neural morphological analyzer from an existing FST // Proceedings of the ACL Workshop on Morphological Resources 2022. Seattle, 2022. P. 12–20.</mixed-citation><mixed-citation xml:lang="en">Schwartz L., Liu S., Surrain S. Bootstrapping a neural morphological analyzer from an existing FST // Proceedings of the ACL Workshop on Morphological Resources 2022. Seattle, 2022. P. 12–20.</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that they have no conflicts of interest.</p></fn></fn-group></back></article>
