spellcheck_utils.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. // This file is part of Desktop App Toolkit,
  2. // a set of libraries for developing nice desktop applications.
  3. //
  4. // For license and copyright information please follow this link:
  5. // https://github.com/desktop-app/legal/blob/master/LEGAL
  6. //
  7. #include "spellcheck/spellcheck_utils.h"
  8. #include "spellcheck/platform/platform_spellcheck.h"
  9. #include <QtCore/QStringList>
  10. #include <QTextBoundaryFinder>
  11. namespace Spellchecker {
  12. namespace {
  // One row of kLocaleScriptList: a language subtag and the script assigned to it.
  13. struct SubtagScript {
  14. const char *subtag; // Subtag text that LocaleToScriptCode() compares against.
  15. QChar::Script script; // Script returned when the subtag matches.
  16. };
  17. // Source of the locale-to-script table (kLocaleScriptList) below: https://chromium.googlesource.com/chromium/src/+/refs/heads/master/third_party/blink/renderer/platform/text/locale_to_script_mapping.cc
  // Scripts that IsWordSkippable() accepts when asked to check supported
  // scripts; replaced as a whole by UpdateSupportedScripts().
  18. std::vector<QChar::Script> SupportedScripts;
  // Fired by UpdateSupportedScripts() after SupportedScripts has been replaced;
  // exposed to callers through SupportedScriptsChanged().
  19. rpl::event_stream<> SupportedScriptsEventStream;
  // Divisor LocaleFromLangId() uses to split an id into its language part
  // (quotient) and country part (remainder).
  20. constexpr auto kFactor = 1000;
  // Characters that IsWordSkippable() tolerates inside a word even when their
  // script differs from the word's script. The decimal values are, in order,
  // U+0301, U+0341, U+02CA, U+030B, U+02DD, U+02F6, U+0317, U+02CF
  // (the commented-out 180 is U+00B4).
  21. constexpr auto kAcuteAccentChars = {
  22. QChar(769), QChar(833), // QChar(180),
  23. QChar(714), QChar(779), QChar(733),
  24. QChar(758), QChar(791), QChar(719),
  25. };
  // Scripts that UpdateSupportedScripts() never puts into SupportedScripts,
  // even if one of the requested languages maps to them.
  26. constexpr auto kUnspellcheckableScripts = {
  27. QChar::Script_Katakana,
  28. QChar::Script_Han,
  29. };
  // Lookup table from language subtag to script. LocaleToScriptCode() scans it
  // linearly and returns the script of the first row whose subtag matches.
  // The commented-out rows at the end still use USCRIPT_* names instead of
  // QChar::Script values and are therefore not part of the table.
  30. constexpr SubtagScript kLocaleScriptList[] = {
  31. {"aa", QChar::Script_Latin}, {"ab", QChar::Script_Cyrillic},
  32. {"ady", QChar::Script_Cyrillic}, {"aeb", QChar::Script_Arabic},
  33. {"af", QChar::Script_Latin}, {"ak", QChar::Script_Latin},
  34. {"am", QChar::Script_Ethiopic}, {"ar", QChar::Script_Arabic},
  35. {"arq", QChar::Script_Arabic}, {"ary", QChar::Script_Arabic},
  36. {"arz", QChar::Script_Arabic}, {"as", QChar::Script_Bengali},
  37. {"ast", QChar::Script_Latin}, {"av", QChar::Script_Cyrillic},
  38. {"ay", QChar::Script_Latin}, {"az", QChar::Script_Latin},
  39. {"azb", QChar::Script_Arabic}, {"ba", QChar::Script_Cyrillic},
  40. {"bal", QChar::Script_Arabic}, {"be", QChar::Script_Cyrillic},
  41. {"bej", QChar::Script_Arabic}, {"bg", QChar::Script_Cyrillic},
  42. {"bi", QChar::Script_Latin}, {"bn", QChar::Script_Bengali},
  43. {"bo", QChar::Script_Tibetan}, {"bqi", QChar::Script_Arabic},
  44. {"brh", QChar::Script_Arabic}, {"bs", QChar::Script_Latin},
  45. {"ca", QChar::Script_Latin}, {"ce", QChar::Script_Cyrillic},
  46. {"ceb", QChar::Script_Latin}, {"ch", QChar::Script_Latin},
  47. {"chk", QChar::Script_Latin}, {"cja", QChar::Script_Arabic},
  48. {"cjm", QChar::Script_Arabic}, {"ckb", QChar::Script_Arabic},
  49. {"cs", QChar::Script_Latin}, {"cy", QChar::Script_Latin},
  50. {"da", QChar::Script_Latin}, {"dcc", QChar::Script_Arabic},
  51. {"de", QChar::Script_Latin}, {"doi", QChar::Script_Arabic},
  52. {"dv", QChar::Script_Thaana}, {"dyo", QChar::Script_Arabic},
  53. {"dz", QChar::Script_Tibetan}, {"ee", QChar::Script_Latin},
  54. {"efi", QChar::Script_Latin}, {"el", QChar::Script_Greek},
  55. {"en", QChar::Script_Latin}, {"es", QChar::Script_Latin},
  56. {"et", QChar::Script_Latin}, {"eu", QChar::Script_Latin},
  57. {"fa", QChar::Script_Arabic}, {"fi", QChar::Script_Latin},
  58. {"fil", QChar::Script_Latin}, {"fj", QChar::Script_Latin},
  59. {"fo", QChar::Script_Latin}, {"fr", QChar::Script_Latin},
  60. {"fur", QChar::Script_Latin}, {"fy", QChar::Script_Latin},
  61. {"ga", QChar::Script_Latin}, {"gaa", QChar::Script_Latin},
  62. {"gba", QChar::Script_Arabic}, {"gbz", QChar::Script_Arabic},
  63. {"gd", QChar::Script_Latin}, {"gil", QChar::Script_Latin},
  64. {"gl", QChar::Script_Latin}, {"gjk", QChar::Script_Arabic},
  65. {"gju", QChar::Script_Arabic}, {"glk", QChar::Script_Arabic},
  66. {"gn", QChar::Script_Latin}, {"gsw", QChar::Script_Latin},
  67. {"gu", QChar::Script_Gujarati}, {"ha", QChar::Script_Latin},
  68. {"haw", QChar::Script_Latin}, {"haz", QChar::Script_Arabic},
  69. {"he", QChar::Script_Hebrew}, {"hi", QChar::Script_Devanagari},
  70. {"hil", QChar::Script_Latin}, {"hnd", QChar::Script_Arabic},
  71. {"hno", QChar::Script_Arabic}, {"ho", QChar::Script_Latin},
  72. {"hr", QChar::Script_Latin}, {"ht", QChar::Script_Latin},
  73. {"hu", QChar::Script_Latin}, {"hy", QChar::Script_Armenian},
  74. {"id", QChar::Script_Latin}, {"ig", QChar::Script_Latin},
  75. {"ii", QChar::Script_Yi}, {"ilo", QChar::Script_Latin},
  76. {"inh", QChar::Script_Cyrillic}, {"is", QChar::Script_Latin},
  77. {"it", QChar::Script_Latin}, {"iu", QChar::Script_CanadianAboriginal},
  78. {"ja", QChar::Script_Katakana}, // or Script_Hiragana.
  79. {"jv", QChar::Script_Latin}, {"ka", QChar::Script_Georgian},
  80. {"kaj", QChar::Script_Latin}, {"kam", QChar::Script_Latin},
  81. {"kbd", QChar::Script_Cyrillic}, {"kha", QChar::Script_Latin},
  82. {"khw", QChar::Script_Arabic}, {"kk", QChar::Script_Cyrillic},
  83. {"kl", QChar::Script_Latin}, {"km", QChar::Script_Khmer},
  84. {"kn", QChar::Script_Kannada}, {"ko", QChar::Script_Hangul},
  85. {"kok", QChar::Script_Devanagari}, {"kos", QChar::Script_Latin},
  86. {"kpe", QChar::Script_Latin}, {"krc", QChar::Script_Cyrillic},
  87. {"ks", QChar::Script_Arabic}, {"ku", QChar::Script_Arabic},
  88. {"kum", QChar::Script_Cyrillic}, {"kvx", QChar::Script_Arabic},
  89. {"kxp", QChar::Script_Arabic}, {"ky", QChar::Script_Cyrillic},
  90. {"la", QChar::Script_Latin}, {"lah", QChar::Script_Arabic},
  91. {"lb", QChar::Script_Latin}, {"lez", QChar::Script_Cyrillic},
  92. {"lki", QChar::Script_Arabic}, {"ln", QChar::Script_Latin},
  93. {"lo", QChar::Script_Lao}, {"lrc", QChar::Script_Arabic},
  94. {"lt", QChar::Script_Latin}, {"luz", QChar::Script_Arabic},
  95. {"lv", QChar::Script_Latin}, {"mai", QChar::Script_Devanagari},
  96. {"mdf", QChar::Script_Cyrillic}, {"mfa", QChar::Script_Arabic},
  97. {"mg", QChar::Script_Latin}, {"mh", QChar::Script_Latin},
  98. {"mi", QChar::Script_Latin}, {"mk", QChar::Script_Cyrillic},
  99. {"ml", QChar::Script_Malayalam}, {"mn", QChar::Script_Cyrillic},
  100. {"mr", QChar::Script_Devanagari},{"ms", QChar::Script_Latin},
  101. {"mt", QChar::Script_Latin}, {"mvy", QChar::Script_Arabic},
  102. {"my", QChar::Script_Myanmar}, {"myv", QChar::Script_Cyrillic},
  103. {"mzn", QChar::Script_Arabic}, {"na", QChar::Script_Latin},
  104. {"nb", QChar::Script_Latin}, {"ne", QChar::Script_Devanagari},
  105. {"niu", QChar::Script_Latin}, {"nl", QChar::Script_Latin},
  106. {"nn", QChar::Script_Latin}, {"nr", QChar::Script_Latin},
  107. {"nso", QChar::Script_Latin}, {"ny", QChar::Script_Latin},
  108. {"oc", QChar::Script_Latin}, {"om", QChar::Script_Latin},
  109. {"or", QChar::Script_Oriya}, {"os", QChar::Script_Cyrillic},
  110. {"pa", QChar::Script_Gurmukhi}, {"pag", QChar::Script_Latin},
  111. {"pap", QChar::Script_Latin}, {"pau", QChar::Script_Latin},
  112. {"pl", QChar::Script_Latin}, {"pon", QChar::Script_Latin},
  113. {"prd", QChar::Script_Arabic}, {"prs", QChar::Script_Arabic},
  114. {"ps", QChar::Script_Arabic}, {"pt", QChar::Script_Latin},
  115. {"qu", QChar::Script_Latin}, {"rm", QChar::Script_Latin},
  116. {"rmt", QChar::Script_Arabic}, {"rn", QChar::Script_Latin},
  117. {"ro", QChar::Script_Latin}, {"ru", QChar::Script_Cyrillic},
  118. {"rw", QChar::Script_Latin}, {"sa", QChar::Script_Devanagari},
  119. {"sah", QChar::Script_Cyrillic}, {"sat", QChar::Script_Latin},
  120. {"sd", QChar::Script_Arabic}, {"sdh", QChar::Script_Arabic},
  121. {"se", QChar::Script_Latin}, {"sg", QChar::Script_Latin},
  122. {"shi", QChar::Script_Arabic}, {"si", QChar::Script_Sinhala},
  123. {"sid", QChar::Script_Latin}, {"sk", QChar::Script_Latin},
  124. {"skr", QChar::Script_Arabic}, {"sl", QChar::Script_Latin},
  125. {"sm", QChar::Script_Latin}, {"so", QChar::Script_Latin},
  126. {"sq", QChar::Script_Latin}, {"sr", QChar::Script_Cyrillic},
  127. {"ss", QChar::Script_Latin}, {"st", QChar::Script_Latin},
  128. {"su", QChar::Script_Latin}, {"sus", QChar::Script_Arabic},
  129. {"sv", QChar::Script_Latin}, {"sw", QChar::Script_Latin},
  130. {"swb", QChar::Script_Arabic}, {"syr", QChar::Script_Arabic},
  131. {"ta", QChar::Script_Tamil}, {"te", QChar::Script_Telugu},
  132. {"tet", QChar::Script_Latin}, {"tg", QChar::Script_Cyrillic},
  133. {"th", QChar::Script_Thai}, {"ti", QChar::Script_Ethiopic},
  134. {"tig", QChar::Script_Ethiopic}, {"tk", QChar::Script_Latin},
  135. {"tkl", QChar::Script_Latin}, {"tl", QChar::Script_Latin},
  136. {"tn", QChar::Script_Latin}, {"to", QChar::Script_Latin},
  137. {"tpi", QChar::Script_Latin}, {"tr", QChar::Script_Latin},
  138. {"trv", QChar::Script_Latin}, {"ts", QChar::Script_Latin},
  139. {"tt", QChar::Script_Cyrillic}, {"ttt", QChar::Script_Arabic},
  140. {"tvl", QChar::Script_Latin}, {"tw", QChar::Script_Latin},
  141. {"ty", QChar::Script_Latin}, {"tyv", QChar::Script_Cyrillic},
  142. {"udm", QChar::Script_Cyrillic}, {"ug", QChar::Script_Arabic},
  143. {"uk", QChar::Script_Cyrillic}, {"und", QChar::Script_Latin},
  144. {"ur", QChar::Script_Arabic}, {"uz", QChar::Script_Cyrillic},
  145. {"ve", QChar::Script_Latin}, {"vi", QChar::Script_Latin},
  146. {"wal", QChar::Script_Ethiopic}, {"war", QChar::Script_Latin},
  147. {"wo", QChar::Script_Latin}, {"xh", QChar::Script_Latin},
  148. {"yap", QChar::Script_Latin}, {"yo", QChar::Script_Latin},
  149. {"za", QChar::Script_Latin}, {"zdj", QChar::Script_Arabic},
  150. {"zh", QChar::Script_Han}, {"zu", QChar::Script_Latin},
  151. // Encompassed languages within the Chinese macrolanguage.
  152. // http://www-01.sil.org/iso639-3/documentation.asp?id=zho
  153. // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html
  154. // {"cdo", USCRIPT_SIMPLIFIED_HAN},
  155. // {"cjy", USCRIPT_SIMPLIFIED_HAN},
  156. // {"cmn", USCRIPT_SIMPLIFIED_HAN},
  157. // {"cpx", USCRIPT_SIMPLIFIED_HAN},
  158. // {"czh", USCRIPT_SIMPLIFIED_HAN},
  159. // {"czo", USCRIPT_SIMPLIFIED_HAN},
  160. // {"gan", USCRIPT_SIMPLIFIED_HAN},
  161. // {"hsn", USCRIPT_SIMPLIFIED_HAN},
  162. // {"mnp", USCRIPT_SIMPLIFIED_HAN},
  163. // {"wuu", USCRIPT_SIMPLIFIED_HAN},
  164. // {"hak", USCRIPT_TRADITIONAL_HAN},
  165. // {"lzh", USCRIPT_TRADITIONAL_HAN},
  166. // {"nan", USCRIPT_TRADITIONAL_HAN},
  167. // {"yue", USCRIPT_TRADITIONAL_HAN},
  168. // {"zh-cdo", USCRIPT_SIMPLIFIED_HAN},
  169. // {"zh-cjy", USCRIPT_SIMPLIFIED_HAN},
  170. // {"zh-cmn", USCRIPT_SIMPLIFIED_HAN},
  171. // {"zh-cpx", USCRIPT_SIMPLIFIED_HAN},
  172. // {"zh-czh", USCRIPT_SIMPLIFIED_HAN},
  173. // {"zh-czo", USCRIPT_SIMPLIFIED_HAN},
  174. // {"zh-gan", USCRIPT_SIMPLIFIED_HAN},
  175. // {"zh-hsn", USCRIPT_SIMPLIFIED_HAN},
  176. // {"zh-mnp", USCRIPT_SIMPLIFIED_HAN},
  177. // {"zh-wuu", USCRIPT_SIMPLIFIED_HAN},
  178. // {"zh-hak", USCRIPT_TRADITIONAL_HAN},
  179. // {"zh-lzh", USCRIPT_TRADITIONAL_HAN},
  180. // {"zh-nan", USCRIPT_TRADITIONAL_HAN},
  181. // {"zh-yue", USCRIPT_TRADITIONAL_HAN},
  182. // // Chinese with regions. Logically, regions should be handled
  183. // // separately, but this works for the current purposes.
  184. // {"zh-hk", USCRIPT_TRADITIONAL_HAN},
  185. // {"zh-mo", USCRIPT_TRADITIONAL_HAN},
  186. // {"zh-tw", USCRIPT_TRADITIONAL_HAN},
  187. };
  188. inline auto IsAcuteAccentChar(const QChar &c) {
  189. return ranges::contains(kAcuteAccentChars, c);
  190. }
  191. inline auto IsSpellcheckableScripts(const QChar::Script &s) {
  192. return !ranges::contains(kUnspellcheckableScripts, s);
  193. }
  194. } // namespace
  195. QChar::Script LocaleToScriptCode(const QString &locale) {
  196. const auto subtag = locale.left(
  197. std::max(locale.indexOf('_'), locale.indexOf('-')));
  198. for (const auto &kv : kLocaleScriptList) {
  199. if (subtag == kv.subtag) {
  200. return kv.script;
  201. }
  202. }
  203. return QChar::Script_Common;
  204. }
  205. QChar::Script WordScript(QStringView word) {
  206. // Find the first letter.
  207. const auto firstLetter = ranges::find_if(word, [](QChar c) {
  208. return c.isLetter();
  209. });
  210. return firstLetter == word.end()
  211. ? QChar::Script_Common
  212. : firstLetter->script();
  213. }
  214. bool IsWordSkippable(QStringView word, bool checkSupportedScripts) {
  215. if (word.size() > kMaxWordSize) {
  216. return true;
  217. }
  218. const auto wordScript = WordScript(word);
  219. if (checkSupportedScripts
  220. && !ranges::contains(SupportedScripts, wordScript)) {
  221. return true;
  222. }
  223. return ranges::any_of(word, [&](QChar c) {
  224. return (c.script() != wordScript)
  225. && !IsAcuteAccentChar(c)
  226. && (c.unicode() != '\'') // Patched Qt to make it a non-separator.
  227. && (c.unicode() != '_'); // This is not a word separator.
  228. });
  229. }
  230. void UpdateSupportedScripts(std::vector<QString> languages) {
  231. // It should be called at least once from Platform::Spellchecker::Init().
  232. SupportedScripts = ranges::views::all(
  233. languages
  234. ) | ranges::views::transform(
  235. LocaleToScriptCode
  236. ) | ranges::views::unique | ranges::views::filter(
  237. IsSpellcheckableScripts
  238. ) | ranges::to_vector;
  239. SupportedScriptsEventStream.fire({});
  240. }
  // Returns a producer of the events of SupportedScriptsEventStream, which
  // UpdateSupportedScripts() fires after replacing SupportedScripts.
  241. rpl::producer<> SupportedScriptsChanged() {
  242. return SupportedScriptsEventStream.events();
  243. }
  244. MisspelledWords RangesFromText(
  245. const QString &text,
  246. Fn<bool(const QString &word)> filterCallback) {
  247. MisspelledWords ranges;
  248. if (text.isEmpty()) {
  249. return ranges;
  250. }
  251. auto finder = QTextBoundaryFinder(QTextBoundaryFinder::Word, text);
  252. const auto isEnd = [&] {
  253. return (finder.toNextBoundary() == -1);
  254. };
  255. while (finder.position() < text.length()) {
  256. if (!finder.boundaryReasons().testFlag(
  257. QTextBoundaryFinder::StartOfItem)) {
  258. if (isEnd()) {
  259. break;
  260. }
  261. continue;
  262. }
  263. const auto start = finder.position();
  264. const auto end = finder.toNextBoundary();
  265. if (end == -1) {
  266. break;
  267. }
  268. const auto length = end - start;
  269. if (length < 1) {
  270. continue;
  271. }
  272. if (!filterCallback(text.mid(start, length))) {
  273. ranges.push_back(std::make_pair(start, length));
  274. }
  275. if (isEnd()) {
  276. break;
  277. }
  278. }
  279. return ranges;
  280. }
  281. bool CheckSkipAndSpell(const QString &word) {
  282. return !IsWordSkippable(word)
  283. && Platform::Spellchecker::CheckSpelling(word);
  284. }
  285. QLocale LocaleFromLangId(int langId) {
  286. if (langId < kFactor) {
  287. return QLocale(static_cast<QLocale::Language>(langId));
  288. }
  289. const auto l = langId / kFactor;
  290. const auto lang = static_cast<QLocale::Language>(l);
  291. const auto country = static_cast<QLocale::Country>(langId - l * kFactor);
  292. return QLocale(lang, country);
  293. }
  294. } // namespace Spellchecker