| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321 |
- // This file is part of Desktop App Toolkit,
- // a set of libraries for developing nice desktop applications.
- //
- // For license and copyright information please follow this link:
- // https://github.com/desktop-app/legal/blob/master/LEGAL
- //
- #include "spellcheck/spellcheck_utils.h"
- #include "spellcheck/platform/platform_spellcheck.h"
- #include <QtCore/QStringList>
- #include <QTextBoundaryFinder>
- namespace Spellchecker {
- namespace {
- struct SubtagScript {
- const char *subtag;
- QChar::Script script;
- };
- // https://chromium.googlesource.com/chromium/src/+/refs/heads/master/third_party/blink/renderer/platform/text/locale_to_script_mapping.cc
- std::vector<QChar::Script> SupportedScripts;
- rpl::event_stream<> SupportedScriptsEventStream;
- constexpr auto kFactor = 1000;
- constexpr auto kAcuteAccentChars = {
- QChar(769), QChar(833), // QChar(180),
- QChar(714), QChar(779), QChar(733),
- QChar(758), QChar(791), QChar(719),
- };
- constexpr auto kUnspellcheckableScripts = {
- QChar::Script_Katakana,
- QChar::Script_Han,
- };
- constexpr SubtagScript kLocaleScriptList[] = {
- {"aa", QChar::Script_Latin}, {"ab", QChar::Script_Cyrillic},
- {"ady", QChar::Script_Cyrillic}, {"aeb", QChar::Script_Arabic},
- {"af", QChar::Script_Latin}, {"ak", QChar::Script_Latin},
- {"am", QChar::Script_Ethiopic}, {"ar", QChar::Script_Arabic},
- {"arq", QChar::Script_Arabic}, {"ary", QChar::Script_Arabic},
- {"arz", QChar::Script_Arabic}, {"as", QChar::Script_Bengali},
- {"ast", QChar::Script_Latin}, {"av", QChar::Script_Cyrillic},
- {"ay", QChar::Script_Latin}, {"az", QChar::Script_Latin},
- {"azb", QChar::Script_Arabic}, {"ba", QChar::Script_Cyrillic},
- {"bal", QChar::Script_Arabic}, {"be", QChar::Script_Cyrillic},
- {"bej", QChar::Script_Arabic}, {"bg", QChar::Script_Cyrillic},
- {"bi", QChar::Script_Latin}, {"bn", QChar::Script_Bengali},
- {"bo", QChar::Script_Tibetan}, {"bqi", QChar::Script_Arabic},
- {"brh", QChar::Script_Arabic}, {"bs", QChar::Script_Latin},
- {"ca", QChar::Script_Latin}, {"ce", QChar::Script_Cyrillic},
- {"ceb", QChar::Script_Latin}, {"ch", QChar::Script_Latin},
- {"chk", QChar::Script_Latin}, {"cja", QChar::Script_Arabic},
- {"cjm", QChar::Script_Arabic}, {"ckb", QChar::Script_Arabic},
- {"cs", QChar::Script_Latin}, {"cy", QChar::Script_Latin},
- {"da", QChar::Script_Latin}, {"dcc", QChar::Script_Arabic},
- {"de", QChar::Script_Latin}, {"doi", QChar::Script_Arabic},
- {"dv", QChar::Script_Thaana}, {"dyo", QChar::Script_Arabic},
- {"dz", QChar::Script_Tibetan}, {"ee", QChar::Script_Latin},
- {"efi", QChar::Script_Latin}, {"el", QChar::Script_Greek},
- {"en", QChar::Script_Latin}, {"es", QChar::Script_Latin},
- {"et", QChar::Script_Latin}, {"eu", QChar::Script_Latin},
- {"fa", QChar::Script_Arabic}, {"fi", QChar::Script_Latin},
- {"fil", QChar::Script_Latin}, {"fj", QChar::Script_Latin},
- {"fo", QChar::Script_Latin}, {"fr", QChar::Script_Latin},
- {"fur", QChar::Script_Latin}, {"fy", QChar::Script_Latin},
- {"ga", QChar::Script_Latin}, {"gaa", QChar::Script_Latin},
- {"gba", QChar::Script_Arabic}, {"gbz", QChar::Script_Arabic},
- {"gd", QChar::Script_Latin}, {"gil", QChar::Script_Latin},
- {"gl", QChar::Script_Latin}, {"gjk", QChar::Script_Arabic},
- {"gju", QChar::Script_Arabic}, {"glk", QChar::Script_Arabic},
- {"gn", QChar::Script_Latin}, {"gsw", QChar::Script_Latin},
- {"gu", QChar::Script_Gujarati}, {"ha", QChar::Script_Latin},
- {"haw", QChar::Script_Latin}, {"haz", QChar::Script_Arabic},
- {"he", QChar::Script_Hebrew}, {"hi", QChar::Script_Devanagari},
- {"hil", QChar::Script_Latin}, {"hnd", QChar::Script_Arabic},
- {"hno", QChar::Script_Arabic}, {"ho", QChar::Script_Latin},
- {"hr", QChar::Script_Latin}, {"ht", QChar::Script_Latin},
- {"hu", QChar::Script_Latin}, {"hy", QChar::Script_Armenian},
- {"id", QChar::Script_Latin}, {"ig", QChar::Script_Latin},
- {"ii", QChar::Script_Yi}, {"ilo", QChar::Script_Latin},
- {"inh", QChar::Script_Cyrillic}, {"is", QChar::Script_Latin},
- {"it", QChar::Script_Latin}, {"iu", QChar::Script_CanadianAboriginal},
- {"ja", QChar::Script_Katakana}, // or Script_Hiragana.
- {"jv", QChar::Script_Latin}, {"ka", QChar::Script_Georgian},
- {"kaj", QChar::Script_Latin}, {"kam", QChar::Script_Latin},
- {"kbd", QChar::Script_Cyrillic}, {"kha", QChar::Script_Latin},
- {"khw", QChar::Script_Arabic}, {"kk", QChar::Script_Cyrillic},
- {"kl", QChar::Script_Latin}, {"km", QChar::Script_Khmer},
- {"kn", QChar::Script_Kannada}, {"ko", QChar::Script_Hangul},
- {"kok", QChar::Script_Devanagari}, {"kos", QChar::Script_Latin},
- {"kpe", QChar::Script_Latin}, {"krc", QChar::Script_Cyrillic},
- {"ks", QChar::Script_Arabic}, {"ku", QChar::Script_Arabic},
- {"kum", QChar::Script_Cyrillic}, {"kvx", QChar::Script_Arabic},
- {"kxp", QChar::Script_Arabic}, {"ky", QChar::Script_Cyrillic},
- {"la", QChar::Script_Latin}, {"lah", QChar::Script_Arabic},
- {"lb", QChar::Script_Latin}, {"lez", QChar::Script_Cyrillic},
- {"lki", QChar::Script_Arabic}, {"ln", QChar::Script_Latin},
- {"lo", QChar::Script_Lao}, {"lrc", QChar::Script_Arabic},
- {"lt", QChar::Script_Latin}, {"luz", QChar::Script_Arabic},
- {"lv", QChar::Script_Latin}, {"mai", QChar::Script_Devanagari},
- {"mdf", QChar::Script_Cyrillic}, {"mfa", QChar::Script_Arabic},
- {"mg", QChar::Script_Latin}, {"mh", QChar::Script_Latin},
- {"mi", QChar::Script_Latin}, {"mk", QChar::Script_Cyrillic},
- {"ml", QChar::Script_Malayalam}, {"mn", QChar::Script_Cyrillic},
- {"mr", QChar::Script_Devanagari},{"ms", QChar::Script_Latin},
- {"mt", QChar::Script_Latin}, {"mvy", QChar::Script_Arabic},
- {"my", QChar::Script_Myanmar}, {"myv", QChar::Script_Cyrillic},
- {"mzn", QChar::Script_Arabic}, {"na", QChar::Script_Latin},
- {"nb", QChar::Script_Latin}, {"ne", QChar::Script_Devanagari},
- {"niu", QChar::Script_Latin}, {"nl", QChar::Script_Latin},
- {"nn", QChar::Script_Latin}, {"nr", QChar::Script_Latin},
- {"nso", QChar::Script_Latin}, {"ny", QChar::Script_Latin},
- {"oc", QChar::Script_Latin}, {"om", QChar::Script_Latin},
- {"or", QChar::Script_Oriya}, {"os", QChar::Script_Cyrillic},
- {"pa", QChar::Script_Gurmukhi}, {"pag", QChar::Script_Latin},
- {"pap", QChar::Script_Latin}, {"pau", QChar::Script_Latin},
- {"pl", QChar::Script_Latin}, {"pon", QChar::Script_Latin},
- {"prd", QChar::Script_Arabic}, {"prs", QChar::Script_Arabic},
- {"ps", QChar::Script_Arabic}, {"pt", QChar::Script_Latin},
- {"qu", QChar::Script_Latin}, {"rm", QChar::Script_Latin},
- {"rmt", QChar::Script_Arabic}, {"rn", QChar::Script_Latin},
- {"ro", QChar::Script_Latin}, {"ru", QChar::Script_Cyrillic},
- {"rw", QChar::Script_Latin}, {"sa", QChar::Script_Devanagari},
- {"sah", QChar::Script_Cyrillic}, {"sat", QChar::Script_Latin},
- {"sd", QChar::Script_Arabic}, {"sdh", QChar::Script_Arabic},
- {"se", QChar::Script_Latin}, {"sg", QChar::Script_Latin},
- {"shi", QChar::Script_Arabic}, {"si", QChar::Script_Sinhala},
- {"sid", QChar::Script_Latin}, {"sk", QChar::Script_Latin},
- {"skr", QChar::Script_Arabic}, {"sl", QChar::Script_Latin},
- {"sm", QChar::Script_Latin}, {"so", QChar::Script_Latin},
- {"sq", QChar::Script_Latin}, {"sr", QChar::Script_Cyrillic},
- {"ss", QChar::Script_Latin}, {"st", QChar::Script_Latin},
- {"su", QChar::Script_Latin}, {"sus", QChar::Script_Arabic},
- {"sv", QChar::Script_Latin}, {"sw", QChar::Script_Latin},
- {"swb", QChar::Script_Arabic}, {"syr", QChar::Script_Arabic},
- {"ta", QChar::Script_Tamil}, {"te", QChar::Script_Telugu},
- {"tet", QChar::Script_Latin}, {"tg", QChar::Script_Cyrillic},
- {"th", QChar::Script_Thai}, {"ti", QChar::Script_Ethiopic},
- {"tig", QChar::Script_Ethiopic}, {"tk", QChar::Script_Latin},
- {"tkl", QChar::Script_Latin}, {"tl", QChar::Script_Latin},
- {"tn", QChar::Script_Latin}, {"to", QChar::Script_Latin},
- {"tpi", QChar::Script_Latin}, {"tr", QChar::Script_Latin},
- {"trv", QChar::Script_Latin}, {"ts", QChar::Script_Latin},
- {"tt", QChar::Script_Cyrillic}, {"ttt", QChar::Script_Arabic},
- {"tvl", QChar::Script_Latin}, {"tw", QChar::Script_Latin},
- {"ty", QChar::Script_Latin}, {"tyv", QChar::Script_Cyrillic},
- {"udm", QChar::Script_Cyrillic}, {"ug", QChar::Script_Arabic},
- {"uk", QChar::Script_Cyrillic}, {"und", QChar::Script_Latin},
- {"ur", QChar::Script_Arabic}, {"uz", QChar::Script_Cyrillic},
- {"ve", QChar::Script_Latin}, {"vi", QChar::Script_Latin},
- {"wal", QChar::Script_Ethiopic}, {"war", QChar::Script_Latin},
- {"wo", QChar::Script_Latin}, {"xh", QChar::Script_Latin},
- {"yap", QChar::Script_Latin}, {"yo", QChar::Script_Latin},
- {"za", QChar::Script_Latin}, {"zdj", QChar::Script_Arabic},
- {"zh", QChar::Script_Han}, {"zu", QChar::Script_Latin},
- // Encompassed languages within the Chinese macrolanguage.
- // http://www-01.sil.org/iso639-3/documentation.asp?id=zho
- // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html
- // {"cdo", USCRIPT_SIMPLIFIED_HAN},
- // {"cjy", USCRIPT_SIMPLIFIED_HAN},
- // {"cmn", USCRIPT_SIMPLIFIED_HAN},
- // {"cpx", USCRIPT_SIMPLIFIED_HAN},
- // {"czh", USCRIPT_SIMPLIFIED_HAN},
- // {"czo", USCRIPT_SIMPLIFIED_HAN},
- // {"gan", USCRIPT_SIMPLIFIED_HAN},
- // {"hsn", USCRIPT_SIMPLIFIED_HAN},
- // {"mnp", USCRIPT_SIMPLIFIED_HAN},
- // {"wuu", USCRIPT_SIMPLIFIED_HAN},
- // {"hak", USCRIPT_TRADITIONAL_HAN},
- // {"lzh", USCRIPT_TRADITIONAL_HAN},
- // {"nan", USCRIPT_TRADITIONAL_HAN},
- // {"yue", USCRIPT_TRADITIONAL_HAN},
- // {"zh-cdo", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-cjy", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-cmn", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-cpx", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-czh", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-czo", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-gan", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-hsn", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-mnp", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-wuu", USCRIPT_SIMPLIFIED_HAN},
- // {"zh-hak", USCRIPT_TRADITIONAL_HAN},
- // {"zh-lzh", USCRIPT_TRADITIONAL_HAN},
- // {"zh-nan", USCRIPT_TRADITIONAL_HAN},
- // {"zh-yue", USCRIPT_TRADITIONAL_HAN},
- // // Chinese with regions. Logically, regions should be handled
- // // separately, but this works for the current purposes.
- // {"zh-hk", USCRIPT_TRADITIONAL_HAN},
- // {"zh-mo", USCRIPT_TRADITIONAL_HAN},
- // {"zh-tw", USCRIPT_TRADITIONAL_HAN},
- };
- inline auto IsAcuteAccentChar(const QChar &c) {
- return ranges::contains(kAcuteAccentChars, c);
- }
- inline auto IsSpellcheckableScripts(const QChar::Script &s) {
- return !ranges::contains(kUnspellcheckableScripts, s);
- }
- } // namespace
- QChar::Script LocaleToScriptCode(const QString &locale) {
- const auto subtag = locale.left(
- std::max(locale.indexOf('_'), locale.indexOf('-')));
- for (const auto &kv : kLocaleScriptList) {
- if (subtag == kv.subtag) {
- return kv.script;
- }
- }
- return QChar::Script_Common;
- }
- QChar::Script WordScript(QStringView word) {
- // Find the first letter.
- const auto firstLetter = ranges::find_if(word, [](QChar c) {
- return c.isLetter();
- });
- return firstLetter == word.end()
- ? QChar::Script_Common
- : firstLetter->script();
- }
- bool IsWordSkippable(QStringView word, bool checkSupportedScripts) {
- if (word.size() > kMaxWordSize) {
- return true;
- }
- const auto wordScript = WordScript(word);
- if (checkSupportedScripts
- && !ranges::contains(SupportedScripts, wordScript)) {
- return true;
- }
- return ranges::any_of(word, [&](QChar c) {
- return (c.script() != wordScript)
- && !IsAcuteAccentChar(c)
- && (c.unicode() != '\'') // Patched Qt to make it a non-separator.
- && (c.unicode() != '_'); // This is not a word separator.
- });
- }
- void UpdateSupportedScripts(std::vector<QString> languages) {
- // It should be called at least once from Platform::Spellchecker::Init().
- SupportedScripts = ranges::views::all(
- languages
- ) | ranges::views::transform(
- LocaleToScriptCode
- ) | ranges::views::unique | ranges::views::filter(
- IsSpellcheckableScripts
- ) | ranges::to_vector;
- SupportedScriptsEventStream.fire({});
- }
- rpl::producer<> SupportedScriptsChanged() {
- return SupportedScriptsEventStream.events();
- }
- MisspelledWords RangesFromText(
- const QString &text,
- Fn<bool(const QString &word)> filterCallback) {
- MisspelledWords ranges;
- if (text.isEmpty()) {
- return ranges;
- }
- auto finder = QTextBoundaryFinder(QTextBoundaryFinder::Word, text);
- const auto isEnd = [&] {
- return (finder.toNextBoundary() == -1);
- };
- while (finder.position() < text.length()) {
- if (!finder.boundaryReasons().testFlag(
- QTextBoundaryFinder::StartOfItem)) {
- if (isEnd()) {
- break;
- }
- continue;
- }
- const auto start = finder.position();
- const auto end = finder.toNextBoundary();
- if (end == -1) {
- break;
- }
- const auto length = end - start;
- if (length < 1) {
- continue;
- }
- if (!filterCallback(text.mid(start, length))) {
- ranges.push_back(std::make_pair(start, length));
- }
- if (isEnd()) {
- break;
- }
- }
- return ranges;
- }
- bool CheckSkipAndSpell(const QString &word) {
- return !IsWordSkippable(word)
- && Platform::Spellchecker::CheckSpelling(word);
- }
- QLocale LocaleFromLangId(int langId) {
- if (langId < kFactor) {
- return QLocale(static_cast<QLocale::Language>(langId));
- }
- const auto l = langId / kFactor;
- const auto lang = static_cast<QLocale::Language>(l);
- const auto country = static_cast<QLocale::Country>(langId - l * kFactor);
- return QLocale(lang, country);
- }
- } // namespace Spellchecker
|