<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">ellibs</journal-id><journal-title-group><journal-title xml:lang="ru">Электронные библиотеки</journal-title><trans-title-group xml:lang="en"><trans-title>Russian Digital Libraries Journal</trans-title></trans-title-group></journal-title-group><issn pub-type="epub">1562-5419</issn><publisher><publisher-name>Казанский (Приволжский) федеральный университет</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.26907/1562-5419-2025-28-6-1346-1367</article-id><article-id custom-type="elpub" pub-id-type="custom">ellibs-623</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>Статьи</subject></subj-group></article-categories><title-group><article-title>Инструмент для оперативной диагностики памяти нейросетевых архитектур языковых моделей</article-title><trans-title-group xml:lang="en"><trans-title>A Tool for Rapid Diagnostics of Memory in Neural Network Architectures of Language Models</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Гавриков</surname><given-names>Павел Андреевич</given-names></name><name name-style="western" xml:lang="en"><surname>Gavrikov</surname><given-names>Pavel Andreevich</given-names></name></name-alternatives><email xlink:type="simple">gavrikov.pa@phystech.edu</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Усманов</surname><given-names>Азамат 
Комилжон</given-names></name><name name-style="western" xml:lang="en"><surname>Usmanov</surname><given-names>Azamat Komiljon</given-names></name></name-alternatives><email xlink:type="simple">usmanov.ak@phystech.edu</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Реваев</surname><given-names>Дмитрий</given-names></name><name name-style="western" xml:lang="en"><surname>Revayev</surname><given-names>Dmitriy</given-names></name></name-alternatives><email xlink:type="simple">revaev.d@phystech.edu</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Бузыканов</surname><given-names>Сергей Николаевич</given-names></name><name name-style="western" xml:lang="en"><surname>Buzykanov</surname><given-names>Sergey Nikolaevich</given-names></name></name-alternatives><email xlink:type="simple">bsn1977@mail.ru</email><xref ref-type="aff" rid="aff-1"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru"><institution>Московский физико-технический институт</institution></aff><aff xml:lang="en"><institution>Moscow Institute of Physics and Technology</institution></aff></aff-alternatives><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>12</month><year>2025</year></pub-date><volume>28</volume><issue>6</issue><fpage>1346</fpage><lpage>1367</lpage><permissions><copyright-statement>Copyright © Гавриков П.А., Усманов А.К., Реваев Д., Бузыканов С.Н., 2025</copyright-statement><copyright-year>2025</copyright-year><copyright-holder xml:lang="ru">Гавриков П.А., Усманов А.К., Реваев Д., Бузыканов С.Н.</copyright-holder><copyright-holder xml:lang="en">Gavrikov P.A., Usmanov A.K., Revayev D., Buzykanov S.N.</copyright-holder><license xml:lang="ru" 
license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>Данная работа распространяется под лицензией Creative Commons Attribution 4.0.</license-p></license><license xml:lang="en" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://ellibs.elpub.ru/jour/article/view/623">https://ellibs.elpub.ru/jour/article/view/623</self-uri><abstract><p>Большие языковые модели (Large Language Models, LLM) прошли путь от простых N-граммных систем до современных универсальных архитектур, однако ключевым ограничением остается квадратичная сложность механизма самовнимания по длине входной последовательности. Это существенно увеличивает потребление памяти и вычислительных ресурсов, а с появлением задач, требующих рекордно длинных контекстов, создает необходимость разработки новых архитектурных решений. Поскольку для исследования предлагаемой архитектуры требуется длительное и дорогостоящее обучение полновесной сети, необходимо разработать инструмент, который позволял бы быстро дать предварительную оценку архитектуре с точки зрения внутренней памяти.


</p><p>В настоящей работе предложен метод количественной оценки внутренней памяти нейросетевых архитектур на основе синтетических тестов, не требующих больших корпусов данных. Под внутренней памятью понимается объем информации, который модель способна воспроизвести без обращения к исходным входам.</p>
<p>Для верификации подхода разработан программный комплекс, апробированный на архитектурах GPT-2 и Mamba. Использованы задачи копирования, инверсии и извлечения значения по ключу. Проведенное сравнение по точности предсказаний, распределению ошибок и вычислительным затратам позволяет оперативно оценивать эффективность и перспективность архитектур LLM.
</p></abstract><trans-abstract xml:lang="en"><p>Large Language Models (LLMs) have evolved from simple n-gram systems to modern universal architectures; however, a key limitation remains the quadratic complexity of the self-attention mechanism with respect to input sequence length. This significantly increases memory consumption and computational costs, and with the emergence of tasks requiring extremely long contexts, creates the need for new architectural solutions. Since evaluating a proposed architecture typically requires long and expensive full-scale training, it is necessary to develop a tool that allows for a rapid preliminary assessment of a model’s internal memory capacity.


</p><p>This paper presents a method for quantitative evaluation of the internal memory of neural network architectures based on synthetic tests that do not require large data corpora. Internal memory is defined as the amount of information a model can reproduce without direct access to its original inputs.</p>
<p>To validate the approach, a software framework was developed and tested on the GPT-2 and Mamba architectures. The experiments employed copy, inversion, and associative retrieval tasks. Comparison of prediction accuracy, error distribution, and computational cost enables a fast assessment of the efficiency and potential of various LLM architectures.
</p></trans-abstract><kwd-group xml:lang="ru"><kwd>большие языковые модели</kwd><kwd>архитектура нейросетей</kwd><kwd>внутренняя память</kwd><kwd>долговременное хранение информации</kwd><kwd>обработка последовательностей</kwd><kwd>измерение функциональной памяти</kwd><kwd>сравнение архитектур</kwd></kwd-group><kwd-group xml:lang="en"><kwd>large language models</kwd><kwd>neural network architecture</kwd><kwd>internal memory</kwd><kwd>long-term information retention</kwd><kwd>sequence processing</kwd><kwd>functional memory measurement</kwd><kwd>architecture comparison</kwd></kwd-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">Kaplan J., McCandlish S., Henighan T., et al. Scaling Laws for Neural Language Models // arXiv preprint arXiv:2001.08361. 2020. https://doi.org/10.48550/arXiv.2001.08361</mixed-citation><mixed-citation xml:lang="en">Kaplan J., McCandlish S., Henighan T., et al. Scaling Laws for Neural Language Models // arXiv preprint arXiv:2001.08361. 2020. https://doi.org/10.48550/arXiv.2001.08361</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Brown T., Mann B., Ryder N., et al. Language Models are Few‑Shot Learners // Advances in Neural Information Processing Systems. 2020. Vol. 33. P. 1877-1901. https://doi.org/10.5555/3495724.3495883</mixed-citation><mixed-citation xml:lang="en">Brown T., Mann B., Ryder N., et al. Language Models are Few‑Shot Learners // Advances in Neural Information Processing Systems. 2020. Vol. 33. P. 1877-1901. https://doi.org/10.5555/3495724.3495883</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Beltagy I., Peters M. E., Cohan A. Longformer: The Long‑Document Transformer // arXiv preprint arXiv:2004.05150. 2020. 
https://doi.org/10.48550/arXiv.2004.05150</mixed-citation><mixed-citation xml:lang="en">Beltagy I., Peters M. E., Cohan A. Longformer: The Long‑Document Transformer // arXiv preprint arXiv:2004.05150. 2020. https://doi.org/10.48550/arXiv.2004.05150</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Radford A., Wu J., Child R., Luan D., Amodei D., Sutskever I. Language Models are Unsupervised Multitask Learners // OpenAI. 2019.</mixed-citation><mixed-citation xml:lang="en">Radford A., Wu J., Child R., Luan D., Amodei D., Sutskever I. Language Models are Unsupervised Multitask Learners // OpenAI. 2019.</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Common Crawl Foundation. Common Crawl dataset. https://commoncrawl.org</mixed-citation><mixed-citation xml:lang="en">Common Crawl Foundation. Common Crawl dataset. https://commoncrawl.org</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Gu A., Goel K., Ré C. Efficiently Modeling Long Sequences with Structured State Spaces // International Conference on Learning Representations (ICLR). 2022.</mixed-citation><mixed-citation xml:lang="en">Gu A., Goel K., Ré C. Efficiently Modeling Long Sequences with Structured State Spaces // International Conference on Learning Representations (ICLR). 2022.</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Gao L., Biderman S., Black S., et al. The Pile: An 800 GB Dataset of Diverse Text for Language Modeling // arXiv preprint arXiv:2101.00027. 2020. https://doi.org/10.48550/arXiv.2101.00027</mixed-citation><mixed-citation xml:lang="en">Gao L., Biderman S., Black S., et al. The Pile: An 800 GB Dataset of Diverse Text for Language Modeling // arXiv preprint arXiv:2101.00027. 
2020. https://doi.org/10.48550/arXiv.2101.00027</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Eldan R., Li Y. TinyStories: How Small Can Language Models Be and Still Speak Coherent English? // arXiv preprint arXiv:2305.07759. 2023. https://doi.org/10.48550/arXiv.2305.07759</mixed-citation><mixed-citation xml:lang="en">Eldan R., Li Y. TinyStories: How Small Can Language Models Be and Still Speak Coherent English? // arXiv preprint arXiv:2305.07759. 2023. https://doi.org/10.48550/arXiv.2305.07759</mixed-citation></citation-alternatives></ref><ref id="cit9"><label>9</label><citation-alternatives><mixed-citation xml:lang="ru">Dao T. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness // Advances in Neural Information Processing Systems (NeurIPS). 2022. Vol. 35. P. 16344-16359. https://doi.org/10.48550/arXiv.2205.14135</mixed-citation><mixed-citation xml:lang="en">Dao T. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness // Advances in Neural Information Processing Systems (NeurIPS). 2022. Vol. 35. P. 16344-16359. https://doi.org/10.48550/arXiv.2205.14135</mixed-citation></citation-alternatives></ref><ref id="cit10"><label>10</label><citation-alternatives><mixed-citation xml:lang="ru">Gu A., Goel K., Dao T., et al. Mamba: Linear-Time Sequence Modeling with Selective State Spaces // International Conference on Learning Representations (ICLR). 2024.</mixed-citation><mixed-citation xml:lang="en">Gu A., Goel K., Dao T., et al. Mamba: Linear-Time Sequence Modeling with Selective State Spaces // International Conference on Learning Representations (ICLR). 2024.</mixed-citation></citation-alternatives></ref><ref id="cit11"><label>11</label><citation-alternatives><mixed-citation xml:lang="ru">Kwon W., Lee S., Li S., Zaharia M., Zhang H., Stoica I., Sheng Y., Crichton W., Xie S., Gonzalez J. 
Efficient Memory Management for Large Language Model Inference with KV-Caching // arXiv preprint arXiv:2309.06180. 2023. https://doi.org/10.48550/arXiv.2309.06180</mixed-citation><mixed-citation xml:lang="en">Kwon W., Lee S., Li S., Zaharia M., Zhang H., Stoica I., Sheng Y., Crichton W., Xie S., Gonzalez J. Efficient Memory Management for Large Language Model Inference with KV-Caching // arXiv preprint arXiv:2309.06180. 2023. https://doi.org/10.48550/arXiv.2309.06180</mixed-citation></citation-alternatives></ref><ref id="cit12"><label>12</label><citation-alternatives><mixed-citation xml:lang="ru">Vaswani A., Shazeer N., Parmar N., et al. Attention Is All You Need // Advances in Neural Information Processing Systems (NIPS). 2017. Vol. 30. P. 5998–6008. https://doi.org/10.5555/3295222.3295349</mixed-citation><mixed-citation xml:lang="en">Vaswani A., Shazeer N., Parmar N., et al. Attention Is All You Need // Advances in Neural Information Processing Systems (NIPS). 2017. Vol. 30. P. 5998–6008. https://doi.org/10.5555/3295222.3295349</mixed-citation></citation-alternatives></ref><ref id="cit13"><label>13</label><citation-alternatives><mixed-citation xml:lang="ru">Tay Y., Bahri D., Metzler D., et al. Long Range Arena: A Benchmark for Efficient Transformers // arXiv preprint arXiv:2011.04006. 2020. https://doi.org/10.48550/arXiv.2011.04006</mixed-citation><mixed-citation xml:lang="en">Tay Y., Bahri D., Metzler D., et al. Long Range Arena: A Benchmark for Efficient Transformers // arXiv preprint arXiv:2011.04006. 2020. https://doi.org/10.48550/arXiv.2011.04006</mixed-citation></citation-alternatives></ref><ref id="cit14"><label>14</label><citation-alternatives><mixed-citation xml:lang="ru">Bulatov A., Kuratov Y., Burtsev M. Recurrent Memory Transformer // Advances in Neural Information Processing Systems. 2022. Vol. 35. P. 11079-11091. https://doi.org/10.48550/arXiv.2207.06881</mixed-citation><mixed-citation xml:lang="en">Bulatov A., Kuratov Y., Burtsev M. 
Recurrent Memory Transformer // Advances in Neural Information Processing Systems. 2022. Vol. 35. P. 11079-11091. https://doi.org/10.48550/arXiv.2207.06881</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
