@inproceedings{sikasote-anastasopoulos21bembaspeech, title = {BembaSpeech: A Speech Recognition Corpus for the Bemba Language}, author = {Sikasote, Claytone and Anastasopoulos, Antonios}, booktitle = {Proceedings of AfricaNLP}, address = {Online}, month = {April}, year = {2021}, url = {https://arxiv.org/pdf/2102.04889.pdf} } @misc{li20comparison, title = {Comparison of Interactive Knowledge Base Spelling Correction Models for Low-Resource Languages}, author = {Li, Yiyuan and Anastasopoulos, Antonios and Black, Alan W}, address = {Online}, month = {October}, year = {2020}, url = {https://arxiv.org/abs/2010.10472}, note = {{arXiv}:2010.10472} } @inproceedings{hossain20emnlpfindings, title = {It's not a Non-Issue: Negation as a Source of Error in Machine Translation}, author = {Hossain, Md Mosharaf and Anastasopoulos, Antonios and Blanco, Eduardo and Palmer, Alexis}, booktitle = {Findings of EMNLP}, address = {Online}, month = {November}, year = {2020}, url = {https://arxiv.org/pdf/2010.05432} } @inproceedings{alam-anastasopoulos-2020-fine, title = "Fine-Tuning {MT} systems for Robustness to Second-Language Speaker Variations", author = "Alam, Md Mahfuz Ibn and Anastasopoulos, Antonios", booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.wnut-1.20", pages = "149--158", abstract = "The performance of neural machine translation (NMT) systems only trained on a single language variant degrades when confronted with even slightly different language variations. With this work, we build upon previous work to explore how to mitigate this issue. We show that fine-tuning using naturally occurring noise along with pseudo-references (i.e. {``}corrected{''} non-native inputs translated using the baseline NMT system) is a promising solution towards systems robust to such types of input variations. We focus on four translation pairs, from English to Spanish, Italian, French, and Portuguese, with our system achieving improvements of up to 3.1 BLEU points compared to the baselines, establishing a new state-of-the-art on the JFLEG-ES dataset. 
All datasets and code are publicly available here: https://github.com/mahfuzibnalam/finetuning{\_}for{\_}robustness .", } @misc{muller20unseen, title = {When Being Unseen from {mBERT} is just the Beginning: Handling New Languages With Multilingual {LM}s}, author = {Muller, Benjamin and Anastasopoulos, Antonios and Sagot, Beno\^{\i}t and Seddah, Djam\'{e}}, url = {http://pauillac.inria.fr/~seddah/Unseen_languages_Mbert.pdf}, year = {2020} } @inproceedings{rijhwani-etal-2020-ocr, title = "{OCR} Post-Correction for Endangered Language Texts", author = "Rijhwani, Shruti and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.478", pages = "5931--5942", } @inproceedings{jiang-etal-2020-x, title = "{X}-{FACTR}: Multilingual Factual Knowledge Retrieval from Pretrained Language Models", author = "Jiang, Zhengbao and Anastasopoulos, Antonios and Araki, Jun and Ding, Haibo and Neubig, Graham", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.479", pages = "5943--5959", } @inproceedings{chaudhary-etal-2020-automatic, title = "Automatic Extraction of Rules Governing Morphological Agreement", author = "Chaudhary, Aditi and Anastasopoulos, Antonios and Pratapa, Adithya and Mortensen, David R. and Sheikh, Zaid and Tsvetkov, Yulia and Neubig, Graham", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.422", pages = "5212--5236", } @inproceedings{dou-etal-2020-dynamic, title = "Dynamic Data Selection and Weighting for Iterative Back-Translation", author = "Dou, Zi-Yi and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.emnlp-main.475", pages = "5894--5904", } @inproceedings{zhao-etal-2020-automatic, title = "Automatic Interlinear Glossing for Under-Resourced Languages Leveraging Translations", author = "Zhao, Xingyuan and Ozaki, Satoru and Anastasopoulos, Antonios and Neubig, Graham and Levin, Lori", booktitle = "Proceedings of the 28th International Conference on Computational Linguistics", month = dec, year = "2020", address = "Barcelona, Spain (Online)", publisher = "International Committee on Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.coling-main.471", doi = "10.18653/v1/2020.coling-main.471", pages = "5397--5408", abstract = "Interlinear Glossed Text (IGT) is a widely used format for encoding linguistic information in language documentation projects and scholarly papers. Manual production of IGT takes time and requires linguistic expertise. We attempt to address this issue by creating automatic glossing models, using modern multi-source neural models that additionally leverage easy-to-collect translations. 
We further explore cross-lingual transfer and a simple output length control mechanism, further refining our models. Evaluated on three challenging low-resource scenarios, our approach significantly outperforms a recent, state-of-the-art baseline, particularly improving on overall accuracy as well as lemma and tag recall.", } @article{chaudhary21reducing, title = {Reducing Confusion in Active Learning for Part-of-Speech Tagging}, author = {Chaudhary, Aditi and Anastasopoulos, Antonios and Sheikh, Zaid and Neubig, Graham}, journal = {Transactions of the Association for Computational Linguistics (TACL)}, month = {February}, year = {2021}, doi = {10.1162/tacl\_a\_00350}, url = {https://www.mitpressjournals.org/doi/full/10.1162/tacl_a_00350}, abstract = { Active learning (AL) uses a data selection algorithm to select useful training samples to minimize annotation cost. This is now an essential tool for building low-resource syntactic analyzers such as part-of-speech (POS) taggers. Existing AL heuristics are generally designed on the principle of selecting uncertain yet representative training instances, where annotating these instances may reduce a large number of errors. However, in an empirical study across six typologically diverse languages (German, Swedish, Galician, North Sami, Persian, and Ukrainian), we found the surprising result that even in an oracle scenario where we know the true uncertainty of predictions, these current heuristics are far from optimal. Based on this analysis, we pose the problem of AL as selecting instances that maximally reduce the confusion between particular pairs of output tags. Extensive experimentation on the aforementioned languages shows that our proposed AL strategy outperforms other AL strategies by a significant margin. We also present auxiliary results demonstrating the importance of proper calibration of models, which we ensure through cross-view training, and analysis demonstrating how our proposed strategy selects examples that more closely follow the oracle data distribution. The code is publicly released here. } } @inproceedings{anastasopoulos-etal-2020-TICO19, abstract = {The COVID-19 pandemic is the worst pandemic to strike the world in over a century. Crucial to stemming the tide of the SARS-CoV-2 virus is communicating to vulnerable populations the means by which they can protect themselves. To this end, the collaborators forming the Translation Initiative for COvid-19 (TICO-19) have made test and development data available to AI and MT researchers in 35 different languages in order to foster the development of tools and resources for improving access to information about COVID-19 in these languages. In addition to 9 high-resourced, ``pivot'' languages, the team is targeting 26 lesser resourced languages, in particular languages of Africa, South Asia and South-East Asia, whose populations may be the most vulnerable to the spread of the virus. The same data is translated into all of the languages represented, meaning that testing or development can be done for any pairing of languages in the set. Further, the team is converting the test and development data into translation memories (TMXs) that can be used by localizers from and to any of the languages. 
As the project continues and we create data for more languages, we will keep updating this paper as well as the project's website.}, author = {Anastasopoulos, Antonios and Cattelan, Alessandro and Dou, Zi-Yi and Federico, Marcello and Federmann, Christian and Genzel, Dmitriy and Guzm\'{a}n, Francisco and Hu, Junjie and Hughes, Macduff and Koehn, Philipp and Lazar, Rosie and Lewis, Will and Neubig, Graham and Niu, Mengmeng and \"{O}ktem, Alp and Paquin, Eric and Tang, Grace and Tur, Sylwia}, booktitle = {NLP COVID-19 Workshop}, title = {{TICO}-19: the {T}ranslation Initiative for {CO}vid-19}, address = {Online}, month = {November}, url = {https://openreview.net/pdf?id=-0xPrt01VXD}, year = {2020} } @inproceedings{bugliarello-etal-2020-easier, title = "It{'}s Easier to Translate out of {E}nglish than into it: {M}easuring Neural Translation Difficulty by Cross-Mutual Information", author = "Bugliarello, Emanuele and Mielke, Sabrina J. and Anastasopoulos, Antonios and Cotterell, Ryan and Okazaki, Naoaki", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.149", doi = "10.18653/v1/2020.acl-main.149", pages = "1640--1649", abstract = "The performance of neural machine translation systems is commonly evaluated in terms of BLEU. However, due to its reliance on target language properties and generation, the BLEU metric does not allow an assessment of which translation directions are more difficult to model. In this paper, we propose cross-mutual information (XMI): an asymmetric information-theoretic metric of machine translation difficulty that exploits the probabilistic nature of most neural machine translation models. XMI allows us to better evaluate the difficulty of translating text into the target language while controlling for the difficulty of the target-side generation component independent of the translation task. We then present the first systematic and controlled study of cross-lingual translation difficulties using modern neural translation systems. Code for replicating our experiments is available online at https://github.com/e-bug/nmt-difficulty.", } @inproceedings{xia-etal-2020-predicting, title = "Predicting Performance for Natural Language Processing Tasks", author = "Xia, Mengzhou and Anastasopoulos, Antonios and Xu, Ruochen and Yang, Yiming and Neubig, Graham", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.764", doi = "10.18653/v1/2020.acl-main.764", pages = "8625--8646", abstract = "Given the complexity of combinations of tasks, languages, and domains in natural language processing (NLP) research, it is computationally prohibitive to exhaustively test newly proposed models on each possible experimental setting. In this work, we attempt to explore the possibility of gaining plausible judgments of how well an NLP model can perform under an experimental setting, \textit{without actually training or testing the model}. To do so, we build regression models to predict the evaluation score of an NLP experiment given the experimental settings as input. 
Experimenting on 9 different NLP tasks, we find that our predictors can produce meaningful predictions over unseen languages and different modeling architectures, outperforming reasonable baselines as well as human experts. Going further, we outline how our predictor can be used to find a small subset of representative experiments that should be run in order to obtain plausible predictions for all other experimental settings.", } @inproceedings{anastasopoulos-neubig-2020-cross, title = "Should All Cross-Lingual Embeddings Speak {E}nglish?", author = "Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.766", doi = "10.18653/v1/2020.acl-main.766", pages = "8658--8679", abstract = "Most recent work in cross-lingual word embeddings is severely Anglocentric. The vast majority of lexicon induction evaluation dictionaries are between English and another language, and the English embedding space is selected by default as the hub when learning in a multilingual setting. With this work, however, we challenge these practices. First, we show that the choice of hub language can significantly impact downstream lexicon induction and zero-shot POS tagging performance. Second, we both expand a standard English-centered evaluation dictionary collection to include all language pairs using triangulation, and create new dictionaries for under-represented languages. Evaluating established methods over all these language pairs sheds light on their suitability for aligning embeddings from distant languages and presents new challenges for the field. Finally, in our analysis we identify general guidelines for strong cross-lingual embedding baselines that extend to language pairs that do not include English.", } @proceedings{winlp-2020-widening, title = "Proceedings of the Fourth Widening Natural Language Processing Workshop", editor = "Cunha, Rossana and Shaikh, Samira and Varis, Erika and Georgi, Ryan and Tsai, Alicia and Anastasopoulos, Antonios and Chandu, Khyathi Raghavi", month = jul, year = "2020", address = "Seattle, USA", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.winlp-1.0", } @inproceedings{vylomova-etal-2020-sigmorphon, title = "{SIGMORPHON} 2020 Shared Task 0: Typologically Diverse Morphological Inflection", author = "Vylomova, Ekaterina and White, Jennifer and Salesky, Elizabeth and Mielke, Sabrina J. 
and Wu, Shijie and Ponti, Edoardo Maria and Hall Maudslay, Rowan and Zmigrod, Ran and Valvoda, Josef and Toldova, Svetlana and Tyers, Francis and Klyachko, Elena and Yegorov, Ilya and Krizhanovsky, Natalia and Czarnowska, Paula and Nikkarinen, Irene and Krizhanovsky, Andrew and Pimentel, Tiago and Torroba Hennigen, Lucas and Kirov, Christo and Nicolai, Garrett and Williams, Adina and Anastasopoulos, Antonios and Cruz, Hilaria and Chodroff, Eleanor and Cotterell, Ryan and Silfverberg, Miikka and Hulden, Mans", booktitle = "Proceedings of the 17th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.sigmorphon-1.1", doi = "10.18653/v1/2020.sigmorphon-1.1", pages = "1--39", abstract = "A broad goal in natural language processing (NLP) is to develop a system that has the capacity to process any natural language. Most systems, however, are developed using data from just one language such as English. The SIGMORPHON 2020 shared task on morphological reinflection aims to investigate systems{'} ability to generalize across typologically distinct languages, many of which are low resource. Systems were developed using data from 45 languages and just 5 language families, fine-tuned with data from an additional 45 languages and 10 language families (13 in total), and evaluated on all 90 languages. A total of 22 systems (19 neural) from 10 teams were submitted to the task. All four winning systems were neural (two monolingual transformers and two massively multilingual RNN-based models with gated attention). Most teams demonstrate utility of data hallucination and augmentation, ensembles, and multilingual training for low-resource languages. Non-neural learners and manually designed grammars showed competitive and even superior performance on some languages (such as Ingrian, Tajik, Tagalog, Zarma, Lingala), especially with very limited data. Some language families (Afro-Asiatic, Niger-Congo, Turkic) were relatively easy for most systems and achieved over 90{\%} mean accuracy while others were more challenging.", } @inproceedings{murikinati-anastasopoulos-2020-cmu, title = "The {CMU}-{LTI} submission to the {SIGMORPHON} 2020 Shared Task 0: Language-Specific Cross-Lingual Transfer", author = "Murikinati, Nikitha and Anastasopoulos, Antonios", booktitle = "Proceedings of the 17th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.sigmorphon-1.6", doi = "10.18653/v1/2020.sigmorphon-1.6", pages = "79--84", abstract = "This paper describes the CMU-LTI submission to the SIGMORPHON 2020 Shared Task 0 on typologically diverse morphological inflection. The (unrestricted) submission uses the cross-lingual approach of our last year{'}s winning submission (Anastasopoulos and Neubig, 2019), but adapted to use specific transfer languages for each test language. 
Our system, with fixed non-tuned hyperparameters, achieved a macro-averaged accuracy of 80.65, ranking 20th among 31 systems, but it was still tied for best system in 25 of the 90 total languages.", } @inproceedings{murikinati-etal-2020-transliteration, title = "Transliteration for Cross-Lingual Morphological Inflection", author = "Murikinati, Nikitha and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 17th SIGMORPHON Workshop on Computational Research in Phonetics, Phonology, and Morphology", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.sigmorphon-1.22", doi = "10.18653/v1/2020.sigmorphon-1.22", pages = "189--197", abstract = "Cross-lingual transfer between typologically related languages has been proven successful for the task of morphological inflection. However, if the languages do not share the same script, current methods yield more modest improvements. We explore the use of transliteration between related languages, as well as grapheme-to-phoneme conversion, as data preprocessing methods in order to alleviate this issue. We experimented with several diverse language pairs, finding that in most cases transliterating the transfer language data into the target one leads to accuracy improvements, even up to 9 percentage points. Converting both languages into a shared space like the International Phonetic Alphabet or the Latin alphabet is also beneficial, leading to improvements of up to 16 percentage points.", } @inproceedings{cruz-etal-2020-resource, title = "A Resource for Studying Chatino Verbal Morphology", author = "Cruz, Hilaria and Anastasopoulos, Antonios and Stump, Gregory", booktitle = "Proceedings of The 12th Language Resources and Evaluation Conference", month = may, year = "2020", address = "Marseille, France", publisher = "European Language Resources Association", url = "https://www.aclweb.org/anthology/2020.lrec-1.344", pages = "2827--2831", abstract = "We present the first resource focusing on the verbal inflectional morphology of San Juan Quiahije Chatino, a tonal Mesoamerican language spoken in Mexico. We provide a collection of complete inflection tables of 198 lemmata, with morphological tags based on the UniMorph schema. We also provide baseline results on three core NLP tasks: morphological analysis, lemmatization, and morphological inflection.", language = "English", ISBN = "979-10-95546-34-4", } @inproceedings{duan-etal-2020-resource, title = "A Resource for Computational Experiments on Mapudungun", author = "Duan, Mingjun and Fasola, Carlos and Rallabandi, Sai Krishna and Vega, Rodolfo and Anastasopoulos, Antonios and Levin, Lori and Black, Alan W", booktitle = "Proceedings of The 12th Language Resources and Evaluation Conference", month = may, year = "2020", address = "Marseille, France", publisher = "European Language Resources Association", url = "https://www.aclweb.org/anthology/2020.lrec-1.350", pages = "2872--2877", abstract = "We present a resource for computational experiments on Mapudungun, a polysynthetic indigenous language spoken in Chile with upwards of 200 thousand speakers. We provide 142 hours of culturally significant conversations in the domain of medical treatment. The conversations are fully transcribed and translated into Spanish. The transcriptions also include annotations for code-switching and non-standard pronunciations. 
We also provide baseline results on three core NLP tasks: speech recognition, speech synthesis, and machine translation between Spanish and Mapudungun. We further explore other applications for which the corpus will be suitable, including the study of code-switching, historical orthography change, linguistic structure, and sociological and anthropological studies.", language = "English", ISBN = "979-10-95546-34-4", } @inproceedings{mortensen-etal-2020-allovera, title = "{A}llo{V}era: A Multilingual Allophone Database", author = "Mortensen, David R. and Li, Xinjian and Littell, Patrick and Michaud, Alexis and Rijhwani, Shruti and Anastasopoulos, Antonios and Black, Alan W and Metze, Florian and Neubig, Graham", booktitle = "Proceedings of The 12th Language Resources and Evaluation Conference", month = may, year = "2020", address = "Marseille, France", publisher = "European Language Resources Association", url = "https://www.aclweb.org/anthology/2020.lrec-1.656", pages = "5329--5336", abstract = "We introduce a new resource, AlloVera, which provides mappings from 218 allophones to phonemes for 14 languages. Phonemes are contrastive phonological units, and allophones are their various concrete realizations, which are predictable from phonological context. While phonemic representations are language specific, phonetic representations (stated in terms of (allo)phones) are much closer to a universal (language-independent) transcription. AlloVera allows the training of speech recognition models that output phonetic transcriptions in the International Phonetic Alphabet (IPA), regardless of the input language. We show that a {``}universal{''} allophone model, Allosaurus, built with AlloVera, outperforms {``}universal{''} phonemic models and language-specific models on a speech-transcription task. We explore the implications of this technology (and related technologies) for the documentation of endangered and minority languages. We further explore other applications for which AlloVera will be suitable as it grows, including phonological typology.", language = "English", ISBN = "979-10-95546-34-4", } @inproceedings{neubig-etal-2020-summary, title = "A Summary of the First Workshop on Language Technology for Language Documentation and Revitalization", author = "Neubig, Graham and Rijhwani, Shruti and Palmer, Alexis and MacKenzie, Jordan and Cruz, Hilaria and Li, Xinjian and Lee, Matthew and Chaudhary, Aditi and Gessler, Luke and Abney, Steven and Hayati, Shirley Anugrah and Anastasopoulos, Antonios and Zamaraeva, Olga and Prud{'}hommeaux, Emily and Child, Jennette and Child, Sara and Knowles, Rebecca and Moeller, Sarah and Micher, Jeffrey and Li, Yiyuan and Zink, Sydney and Xia, Mengzhou and Sharma, Roshan S and Littell, Patrick", booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)", month = may, year = "2020", address = "Marseille, France", publisher = "European Language Resources Association", url = "https://www.aclweb.org/anthology/2020.sltu-1.48", pages = "342--351", abstract = "Despite recent advances in natural language processing and other language technology, the application of such technology to language documentation and conservation has been limited. 
In August 2019, a workshop was held at Carnegie Mellon University in Pittsburgh, PA, USA to attempt to bring together language community members, documentary linguists, and technologists to discuss how to bridge this gap and create prototypes of novel and practical language revitalization technologies. The workshop focused on developing technologies to aid language documentation and revitalization in four areas: 1) spoken language (speech transcription, phone to orthography decoding, text-to-speech and text-speech forced alignment), 2) dictionary extraction and management, 3) search tools for corpora, and 4) social media (language learning bots and social media analysis). This paper reports the results of this workshop, including issues discussed, and various conceived and implemented technologies for nine languages: Arapaho, Cayuga, Inuktitut, Irish Gaelic, Kidaw{'}ida, Kwak{'}wala, Ojibwe, San Juan Quiahije Chatino, and Seneca.", language = "English", ISBN = "979-10-95546-35-1", } @misc{madaan2020practical, title = {Practical Comparable Data Collection for Low-Resource Languages via Images}, author = {Madaan, Aman and Rijhwani, Shruti and Anastasopoulos, Antonios and Yang, Yiming and Neubig, Graham}, year = {2020}, note = {In the AfricaNLP workshop at ICLR 2020.} } @inproceedings{li20icassp, title = {Universal Phone Recognition with a Multilingual Allophone System}, author = {Li, Xinjian and Dalmia, Siddharth and Li, Juncheng and Littell, Patrick and Lee, Matthew and Yao, Jiali and Anastasopoulos, Antonios and Mortensen, David R. and Neubig, Graham and Black, Alan W and Metze, Florian}, booktitle = {2020 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, address = {Barcelona, Spain}, month = {May}, url = {http://arxiv.org/abs/2002.11800}, year = {2020} } @misc{kurita20towards, title = {Towards Robust Toxic Content Classification}, author = {Kurita, Keita and Belova, Anna and Anastasopoulos, Antonios}, year = {2020}, note = {{arXiv}:1912.06872} } @inproceedings{wang20icml, title = {Optimizing Data Usage via Differentiable Rewards}, author = {Wang, Xinyi and Pham, Hieu and Michel, Paul and Anastasopoulos, Antonios and Carbonell, Jaime and Neubig, Graham}, booktitle = {International Conference on Machine Learning (ICML)}, month = {July}, url = {https://arxiv.org/abs/1911.10088}, year = {2020} } @inproceedings{lin-etal-2019-choosing, title = "Choosing Transfer Languages for Cross-Lingual Learning", author = "Lin, Yu-Hsiang and Chen, Chian-Yu and Lee, Jean and Li, Zirui and Zhang, Yuyan and Xia, Mengzhou and Rijhwani, Shruti and He, Junxian and Zhang, Zhisong and Ma, Xuezhe and Anastasopoulos, Antonios and Littell, Patrick and Neubig, Graham", booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P19-1301", doi = "10.18653/v1/P19-1301", pages = "3125--3135", abstract = "Cross-lingual transfer, where a high-resource transfer language is used to improve the accuracy of a low-resource task language, is now an invaluable tool for improving performance of natural language processing (NLP) on low-resource languages. However, given a particular task language, it is not clear which language to transfer from, and the standard strategy is to select languages based on ad hoc criteria, usually the intuition of the experimenter. 
Since a large number of features contribute to the success of cross-lingual transfer (including phylogenetic similarity, typological properties, lexical overlap, or size of available data), even the most enlightened experimenter rarely considers all these factors for the particular task at hand. In this paper, we consider this task of automatically selecting optimal transfer languages as a ranking problem, and build models that consider the aforementioned features to perform this prediction. In experiments on representative NLP tasks, we demonstrate that our model predicts good transfer languages much better than ad hoc baselines considering single features in isolation, and glean insights on what features are most informative for each different NLP task, which may inform future ad hoc selection even without use of our method.", } @inproceedings{xia-etal-2019-generalized, title = "Generalized Data Augmentation for Low-Resource Translation", author = "Xia, Mengzhou and Kong, Xiang and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P19-1579", doi = "10.18653/v1/P19-1579", pages = "5786--5796", abstract = "Low-resource language pairs with a paucity of parallel data pose challenges for machine translation in terms of both adequacy and fluency. Data augmentation utilizing a large amount of monolingual data is regarded as an effective way to alleviate the problem. In this paper, we propose a general framework of data augmentation for low-resource machine translation not only using target-side monolingual data, but also by pivoting through a related high-resource language. Specifically, we experiment with a two-step pivoting method to convert high-resource data to the low-resource language, making best use of available resources to better approximate the true distribution of the low-resource language. First, we inject low-resource words into high-resource sentences through an induced bilingual dictionary. Second, we further edit the high-resource data injected with low-resource words using a modified unsupervised machine translation framework. Extensive experiments on four low-resource datasets show that under extreme low-resource settings, our data augmentation techniques improve translation quality by up to 1.5 to 8 BLEU points compared to supervised back-translation baselines.", } @inproceedings{anastasopoulos-2019-analysis, title = "An Analysis of Source-Side Grammatical Errors in {NMT}", author = "Anastasopoulos, Antonios", booktitle = "Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP", month = aug, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-4822", doi = "10.18653/v1/W19-4822", pages = "213--223", abstract = "The quality of Neural Machine Translation (NMT) has been shown to significantly degrade when confronted with source-side noise. We present the first large-scale study of state-of-the-art English-to-German NMT on real grammatical noise, by evaluating on several Grammar Correction corpora. We present methods for evaluating NMT robustness without true references, and we use them for extensive analysis of the effects that different grammatical errors have on the NMT output. 
We also introduce a technique for visualizing the divergence distribution caused by a source-side error, which allows for additional insights.", } @inproceedings{li-etal-2019-findings, title = "Findings of the First Shared Task on Machine Translation Robustness", author = "Li, Xian and Michel, Paul and Anastasopoulos, Antonios and Belinkov, Yonatan and Durrani, Nadir and Firat, Orhan and Koehn, Philipp and Neubig, Graham and Pino, Juan and Sajjad, Hassan", booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)", month = aug, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-5303", doi = "10.18653/v1/W19-5303", pages = "91--102", abstract = "We share the findings of the first shared task on improving robustness of Machine Translation (MT). The task provides a testbed representing challenges facing MT models deployed in the real world, and facilitates new approaches to improve models{'} robustness to noisy input and domain mismatch. We focus on two language pairs (English-French and English-Japanese), and the submitted systems are evaluated on a blind test set consisting of noisy comments on Reddit and professionally sourced translations. As a new task, we received 23 submissions by 11 participating teams from universities, companies, national labs, etc. All submitted systems achieved large improvements over baselines, with the best improvement having +22.33 BLEU. We evaluated submissions by both human judgment and automatic evaluation (BLEU), which shows high correlations (Pearson{'}s r = 0.94 and 0.95). Furthermore, we conducted a qualitative analysis of the submitted systems using compare-mt, which revealed their salient differences in handling challenges in this task. Such analysis provides additional insights when there is occasional disagreement between human judgment and BLEU, e.g. systems better at producing colloquial expressions received higher score from human judgment.", } @inproceedings{zhou-etal-2019-improving, title = "Improving Robustness of Neural Machine Translation with Multi-task Learning", author = "Zhou, Shuyan and Zeng, Xiangkai and Zhou, Yingqi and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)", month = aug, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-5368", doi = "10.18653/v1/W19-5368", pages = "565--571", abstract = "While neural machine translation (NMT) achieves remarkable performance on clean, in-domain text, performance is known to degrade drastically when facing text which is full of typos, grammatical errors and other varieties of noise. In this work, we propose a multi-task learning algorithm for transformer-based MT systems that is more resilient to this noise. We describe our submission to the WMT 2019 Robustness shared task based on this method. 
Our model achieves a BLEU score of 32.8 on the shared task French to English dataset, which is 7.1 BLEU points higher than the baseline vanilla transformer trained with clean text.", } @inproceedings{anastasopoulos-neubig-2019-pushing, title = "Pushing the Limits of Low-Resource Morphological Inflection", author = "Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", month = nov, year = "2019", address = "Hong Kong, China", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D19-1091", doi = "10.18653/v1/D19-1091", pages = "984--996", abstract = "Recent years have seen exceptional strides in the task of automatic morphological inflection generation. However, for a long tail of languages the necessary resources are hard to come by, and state-of-the-art neural methods that work well under higher resource settings perform poorly in the face of a paucity of data. In response, we propose a battery of improvements that greatly improve performance under such low-resource conditions. First, we present a novel two-step attention architecture for the inflection decoder. In addition, we investigate the effects of cross-lingual transfer from single and multiple languages, as well as monolingual data hallucination. The macro-averaged accuracy of our models outperforms the state-of-the-art by 15 percentage points. Also, we identify the crucial factors for success with cross-lingual transfer for morphological inflection: typological similarity and a common representation across languages.", } @inproceedings{dou-etal-2019-investigating, title = "Investigating Meta-Learning Algorithms for Low-Resource Natural Language Understanding Tasks", author = "Dou, Zi-Yi and Yu, Keyi and Anastasopoulos, Antonios", booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", month = nov, year = "2019", address = "Hong Kong, China", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D19-1112", doi = "10.18653/v1/D19-1112", pages = "1192--1197", abstract = "Learning general representations of text is a fundamental problem for many natural language understanding (NLU) tasks. Previously, researchers have proposed to use language model pre-training and multi-task learning to learn robust representations. However, these methods can achieve sub-optimal performance in low-resource scenarios. Inspired by the recent success of optimization-based meta-learning algorithms, in this paper, we explore the model-agnostic meta-learning algorithm (MAML) and its variants for low-resource NLU tasks. We validate our methods on the GLUE benchmark and show that our proposed models can outperform several strong baselines. 
We further empirically demonstrate that the learned representations can be adapted to new tasks efficiently and effectively.", } @inproceedings{dou-etal-2019-unsupervised, title = "Unsupervised Domain Adaptation for Neural Machine Translation with Domain-Aware Feature Embeddings", author = "Dou, Zi-Yi and Hu, Junjie and Anastasopoulos, Antonios and Neubig, Graham", booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", month = nov, year = "2019", address = "Hong Kong, China", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D19-1147", doi = "10.18653/v1/D19-1147", pages = "1417--1422", abstract = "The recent success of neural machine translation models relies on the availability of high quality, in-domain data. Domain adaptation is required when domain-specific data is scarce or nonexistent. Previous unsupervised domain adaptation strategies include training the model with in-domain copied monolingual or back-translated data. However, these methods use generic representations for text regardless of domain shift, which makes it infeasible for translation models to control outputs conditional on a specific domain. In this work, we propose an approach that adapts models with domain-aware feature embeddings, which are learned via an auxiliary language modeling task. Our approach allows the model to assign domain-specific representations to words and output sentences in the desired domain. Our empirical results demonstrate the effectiveness of the proposed strategy, achieving consistent improvements in multiple experimental settings. In addition, we show that combining our method with back translation can further improve the performance of the model.", } @inproceedings{anastasopoulos-etal-2019-neural, title = "Neural Machine Translation of Text from Non-Native Speakers", author = "Anastasopoulos, Antonios and Lui, Alison and Nguyen, Toan Q. and Chiang, David", booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", month = jun, year = "2019", address = "Minneapolis, Minnesota", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N19-1311", doi = "10.18653/v1/N19-1311", pages = "3070--3080", abstract = "Neural Machine Translation (NMT) systems are known to degrade when confronted with noisy data, especially when the system is trained only on clean data. In this paper, we show that augmenting training data with sentences containing artificially-introduced grammatical errors can make the system more robust to such errors. In combination with an automatic grammar error correction system, we can recover 1.0 BLEU out of 2.4 BLEU lost due to grammatical errors. 
We also present a set of Spanish translations of the JFLEG grammar error correction corpus, which allows for testing NMT robustness to real grammatical errors.", } @inproceedings{anastasopoulos-etal-2018-part, title = "Part-of-Speech Tagging on an Endangered Language: a Parallel {G}riko-{I}talian Resource", author = "Anastasopoulos, Antonios and Lekakou, Marika and Quer, Josep and Zimianiti, Eleni and DeBenedetto, Justin and Chiang, David", booktitle = "Proceedings of the 27th International Conference on Computational Linguistics", month = aug, year = "2018", address = "Santa Fe, New Mexico, USA", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/C18-1214", pages = "2529--2539", abstract = "Most work on part-of-speech (POS) tagging is focused on high resource languages, or examines low-resource and active learning settings through simulated studies. We evaluate POS tagging techniques on an actual endangered language, Griko. We present a resource that contains 114 narratives in Griko, along with sentence-level translations in Italian, and provides gold annotations for the test set. Based on a previously collected small corpus, we investigate several traditional methods, as well as methods that take advantage of monolingual data or project cross-lingual POS tags. We show that the combination of a semi-supervised method with cross-lingual transfer is more appropriate for this extremely challenging setting, with the best tagger achieving an accuracy of 72.9{\%}. With an applied active learning scheme, which we use to collect sentence-level annotations over the test set, we achieve improvements of more than 21 percentage points.", } @inproceedings{anastasopoulos-chiang-2018-tied, title = "Tied Multitask Learning for Neural Speech Translation", author = "Anastasopoulos, Antonios and Chiang, David", booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", month = jun, year = "2018", address = "New Orleans, Louisiana", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N18-1008", doi = "10.18653/v1/N18-1008", pages = "82--91", abstract = "We explore multitask models for neural translation of speech, augmenting them in order to reflect two intuitive notions. First, we introduce a model where the second task decoder receives information from the decoder of the first task, since higher-level intermediate representations should provide useful information. Second, we apply regularization that encourages transitivity and invertibility. We show that the application of these notions on jointly trained models improves performance on the tasks of low-resource speech transcription and translation. 
It also leads to better performance when using attention information for word discovery over unsegmented input.", } @inproceedings{anastasopoulos-etal-2016-unsupervised, title = "An Unsupervised Probability Model for Speech-to-Translation Alignment of Low-Resource Languages", author = "Anastasopoulos, Antonios and Chiang, David and Duong, Long", booktitle = "Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2016", address = "Austin, Texas", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D16-1133", doi = "10.18653/v1/D16-1133", pages = "1255--1263", } @inproceedings{duong-etal-2016-attentional, title = "An Attentional Model for Speech Translation Without Transcription", author = "Duong, Long and Anastasopoulos, Antonios and Chiang, David and Bird, Steven and Cohn, Trevor", booktitle = "Proceedings of the 2016 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies", month = jun, year = "2016", address = "San Diego, California", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N16-1109", doi = "10.18653/v1/N16-1109", pages = "949--959", } @inproceedings{anastasopoulos2018leveraging, title = {Leveraging Translations for Speech Transcription in Low-resource Settings}, author = {Anastasopoulos, Antonios and Chiang, David}, booktitle = {Proc. Interspeech 2018}, pages = {1279--1283}, year = {2018}, url = {https://www.isca-speech.org/archive/Interspeech_2018/pdfs/2162.pdf} }