index.js 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. var TokenStream = require('../common/TokenStream');
  2. var adoptBuffer = require('../common/adopt-buffer');
  3. var constants = require('./const');
  4. var TYPE = constants.TYPE;
  5. var CHARCODE = constants.CHARCODE;
  6. var INPUT_STREAM_CODE = constants.INPUT_STREAM_CODE;
  7. var INPUT_STREAM_CODE_STRING = constants.INPUT_STREAM_CODE_STRING;
  8. var INPUT_STREAM_CODE_URL = constants.INPUT_STREAM_CODE_URL;
  9. var INPUT_STREAM_CODE_TYPE = constants.INPUT_STREAM_CODE_TYPE;
  10. var utils = require('./utils');
  11. var firstCharOffset = utils.firstCharOffset;
  12. var cmpStr = utils.cmpStr;
  13. var getNewlineLength = utils.getNewlineLength;
  14. var isNewline = utils.isNewline;
  15. var isName = utils.isName;
  16. var isValidEscape = utils.isValidEscape;
  17. var isNumberStart = utils.isNumberStart;
  18. var isIdentifierStart = utils.isIdentifierStart;
  19. var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
  20. var consumeEscaped = utils.consumeEscaped;
  21. var consumeName = utils.consumeName;
  22. var consumeNumber = utils.consumeNumber;
  23. var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
  24. var ASTERISK = CHARCODE.Asterisk;
  25. var HYPHENMINUS = CHARCODE.HyphenMinus;
  26. var GREATERTHANSIGN = CHARCODE.GreaterThanSign;
  27. var EXCLAMATIONMARK = CHARCODE.ExclamationMark;
  28. var PERCENTSIGN = CHARCODE.PercentSign;
  29. var OFFSET_MASK = 0x00FFFFFF;
  30. var TYPE_SHIFT = 24;
  31. function tokenize(source, stream) {
  /**
   * Look up the UTF-16 code unit of `source` at the given position.
   *
   * @param {number} offset Position inside `source`.
   * @returns {number} The code unit at `offset`, or 0 when `offset` is not
   *     below `sourceLength` (i.e. there is nothing left to read).
   */
  function getCharCode(offset) {
      if (offset < sourceLength) {
          return source.charCodeAt(offset);
      }

      return 0;
  }
  35. // § 4.3.3. Consume a numeric token
  /**
   * Consume a numeric token that begins at the current `offset`.
   *
   * Works on the enclosing tokenizer state: `offset` is moved past the token
   * and `type` is set to one of TYPE.Dimension, TYPE.Percentage or TYPE.Number.
   * Nothing is returned.
   */
  function consumeNumericToken() {
      // The numeric part always comes first.
      offset = consumeNumber(source, offset);

      var afterNumber = getCharCode(offset);

      if (isIdentifierStart(afterNumber, getCharCode(offset + 1), getCharCode(offset + 2))) {
          // A number immediately followed by an identifier is a <dimension-token>;
          // the identifier (the unit) becomes part of the token.
          type = TYPE.Dimension;
          offset = consumeName(source, offset);
      } else if (afterNumber === PERCENTSIGN) {
          // A number followed by "%" is a <percentage-token>; take the sign too.
          type = TYPE.Percentage;
          offset++;
      } else {
          // A plain <number-token>.
          type = TYPE.Number;
      }
  }
  58. // § 4.3.4. Consume an ident-like token
  /**
   * Consume an ident-like token that begins at the current `offset`.
   *
   * Works on the enclosing tokenizer state: `offset` is moved past the token
   * and `type` becomes TYPE.Ident or TYPE.Function, or whatever
   * consumeUrlToken() decides for an unquoted `url(`. Nothing is returned.
   */
  function consumeIdentLikeToken() {
      var identStart = offset;

      // Read the name first; everything else depends on what follows it.
      offset = consumeName(source, offset);

      var namedUrl = cmpStr(source, identStart, offset, 'url');
      var opensParenthesis = getCharCode(offset) === 0x0028;

      // No "(" after the name: a plain <ident-token>.
      if (!opensParenthesis) {
          type = TYPE.Ident;
          return;
      }

      // Any name other than "url" followed by "(": a <function-token> that
      // includes the parenthesis.
      if (!namedUrl) {
          type = TYPE.Function;
          offset++;
          return;
      }

      // "url(" - look past any whitespace to see whether the argument is quoted.
      offset = findWhiteSpaceEnd(source, offset + 1);

      var firstArgumentCode = getCharCode(offset);

      if (firstArgumentCode === 0x0022 || firstArgumentCode === 0x0027) {
          // Quoted argument: emit only "url(" as a <function-token>. The offset
          // is rewound to just after the parenthesis, so the skipped whitespace
          // and the string are tokenized by the main loop.
          type = TYPE.Function;
          offset = identStart + 4;
          return;
      }

      // Unquoted argument: the rest is handled as a url token.
      consumeUrlToken();
  }
  92. // § 4.3.5. Consume a string token
  /**
   * Consume a string token that begins at the current `offset`.
   *
   * Works on the enclosing tokenizer state: `offset` is moved past the token
   * and `type` is set to TYPE.String, or to TYPE.BadString when an unescaped
   * newline is met before the ending code point. Reaching the end of the
   * source without an ending code point leaves the type as TYPE.String.
   *
   * @param {number} [endingCodePoint] Code point that terminates the string.
   *     When omitted (or falsy) the code unit at `offset` is used as the
   *     terminator and is itself consumed.
   */
  function consumeStringToken(endingCodePoint) {
      if (!endingCodePoint) {
          endingCodePoint = getCharCode(offset++);
      }

      // Initially the token is a <string-token>.
      type = TYPE.String;

      for (; offset < source.length; offset++) {
          var code = source.charCodeAt(offset);

          // Non-ASCII code units are never a newline, a backslash or EOF, so
          // they are ordinary string content unless they are the ending code
          // point. They are handled before the table lookup because the switch
          // below mixes raw code values with INPUT_STREAM_CODE_TYPE category
          // constants: a raw non-ASCII value must not be able to alias one of
          // those constants (NOTE(review): the constants live in ./const -
          // confirm their numeric range).
          if (code >= 0x80) {
              if (code === endingCodePoint) {
                  offset++;
                  return;
              }

              continue;
          }

          switch (INPUT_STREAM_CODE_STRING[code]) {
              // ending code point
              case endingCodePoint:
                  // Take the terminator and finish the <string-token>.
                  offset++;
                  return;

              // EOF
              case INPUT_STREAM_CODE_TYPE.Eof:
                  // This is a parse error. Finish the <string-token> as is.
                  return;

              // newline
              case INPUT_STREAM_CODE_TYPE.Newline:
                  // This is a parse error. The offset is advanced by the
                  // newline length, so the newline becomes part of the
                  // resulting <bad-string-token>.
                  offset += getNewlineLength(source, offset, code);
                  type = TYPE.BadString;
                  return;

              // U+005C REVERSE SOLIDUS (\)
              case 0x005C:
                  // A backslash as the very last code unit: do nothing.
                  if (offset === source.length - 1) {
                      break;
                  }

                  var nextCode = getCharCode(offset + 1);

                  if (isNewline(nextCode)) {
                      // Escaped newline: skip it (the loop increment steps
                      // over the backslash itself).
                      offset += getNewlineLength(source, offset + 1, nextCode);
                  } else if (isValidEscape(code, nextCode)) {
                      // Valid escape: consume it; "- 1" compensates for the
                      // loop increment that follows.
                      offset = consumeEscaped(source, offset) - 1;
                  }
                  break;

              // anything else is ordinary string content
          }
      }
  }
  144. // § 4.3.6. Consume a url token
  145. // Note: This algorithm assumes that the initial "url(" has already been consumed.
  146. // This algorithm also assumes that it’s being called to consume an "unquoted" value, like url(foo).
  147. // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
  148. // automatically handles this distinction; this algorithm shouldn’t be called directly otherwise.
  /**
   * Consume an unquoted url token; the leading "url(" must already have been
   * consumed by the caller.
   *
   * Works on the enclosing tokenizer state: `offset` is moved past the token
   * and `type` is set to TYPE.Url, or to TYPE.BadUrl when the content is not
   * a valid unquoted url (the remnants up to the end of the bad url are then
   * consumed through consumeBadUrlRemnants). Reaching the end of the source
   * before ")" leaves the type as TYPE.Url.
   */
  function consumeUrlToken() {
      // Initially the token is a <url-token>.
      type = TYPE.Url;

      // Leading whitespace is skipped.
      offset = findWhiteSpaceEnd(source, offset);

      for (; offset < source.length; offset++) {
          var code = source.charCodeAt(offset);

          // Non-ASCII code units are ordinary url content: every special
          // case below (")", whitespace, quotes, "(", non-printable, "\")
          // is an ASCII code point. They are skipped before the table lookup
          // because the switch mixes raw code values with
          // INPUT_STREAM_CODE_TYPE category constants, and a raw non-ASCII
          // value must not be able to alias one of those constants
          // (NOTE(review): the constants live in ./const - confirm their
          // numeric range).
          if (code >= 0x80) {
              continue;
          }

          switch (INPUT_STREAM_CODE_URL[code]) {
              // U+0029 RIGHT PARENTHESIS ())
              case 0x0029:
                  // Take the parenthesis and finish the <url-token>.
                  offset++;
                  return;

              // EOF
              case INPUT_STREAM_CODE_TYPE.Eof:
                  // This is a parse error. Finish the <url-token> as is.
                  return;

              // whitespace
              case INPUT_STREAM_CODE_TYPE.WhiteSpace:
                  // Skip the whole whitespace run.
                  offset = findWhiteSpaceEnd(source, offset);

                  // Whitespace may only be followed by ")" or the end of input.
                  if (getCharCode(offset) === 0x0029 || offset >= source.length) {
                      if (offset < source.length) {
                          offset++;
                      }
                      return;
                  }

                  // Anything else after whitespace makes it a <bad-url-token>.
                  offset = consumeBadUrlRemnants(source, offset);
                  type = TYPE.BadUrl;
                  return;

              // U+0022 QUOTATION MARK (")
              // U+0027 APOSTROPHE (')
              // U+0028 LEFT PARENTHESIS (()
              // non-printable code point
              case 0x0022:
              case 0x0027:
              case 0x0028:
              case INPUT_STREAM_CODE_TYPE.NonPrintable:
                  // This is a parse error: the token becomes a <bad-url-token>.
                  offset = consumeBadUrlRemnants(source, offset);
                  type = TYPE.BadUrl;
                  return;

              // U+005C REVERSE SOLIDUS (\)
              case 0x005C:
                  if (isValidEscape(code, getCharCode(offset + 1))) {
                      // Valid escape: consume it; "- 1" compensates for the
                      // loop increment that follows.
                      offset = consumeEscaped(source, offset) - 1;
                      break;
                  }

                  // An invalid escape is a parse error: <bad-url-token>.
                  offset = consumeBadUrlRemnants(source, offset);
                  type = TYPE.BadUrl;
                  return;

              // anything else is ordinary url content
          }
      }
  }
  216. if (!stream) {
  217. stream = new TokenStream();
  218. }
  219. // ensure source is a string
  220. source = String(source || '');
  221. var start = firstCharOffset(source);
  222. var sourceLength = source.length;
  223. var offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 because of eof-token
  224. var balance = adoptBuffer(stream.balance, sourceLength + 1);
  225. var tokenCount = 0;
  226. var offset = start;
  227. var balanceCloseType = 0;
  228. var balanceStart = 0;
  229. var balancePrev = 0;
  230. // https://drafts.csswg.org/css-syntax-3/#consume-token
  231. // § 4.3.1. Consume a token
  232. while (offset < sourceLength) {
  233. var code = source.charCodeAt(offset);
  234. var type = 0;
  235. balance[tokenCount] = sourceLength;
  236. switch (code < 0x80 ? INPUT_STREAM_CODE[code] : INPUT_STREAM_CODE_TYPE.NameStart) {
  237. // whitespace
  238. case INPUT_STREAM_CODE_TYPE.WhiteSpace:
  239. // Consume as much whitespace as possible. Return a <whitespace-token>.
  240. type = TYPE.WhiteSpace;
  241. offset = findWhiteSpaceEnd(source, offset + 1);
  242. break;
  243. // U+0022 QUOTATION MARK (")
  244. case 0x0022:
  245. // Consume a string token and return it.
  246. consumeStringToken();
  247. break;
  248. // U+0023 NUMBER SIGN (#)
  249. case 0x0023:
  250. // If the next input code point is a name code point or the next two input code points are a valid escape, then:
  251. if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
  252. // Create a <hash-token>.
  253. type = TYPE.Hash;
  254. // If the next 3 input code points would start an identifier, set the <hash-token>’s type flag to "id".
  255. // if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
  256. // // TODO: set id flag
  257. // }
  258. // Consume a name, and set the <hash-token>’s value to the returned string.
  259. offset = consumeName(source, offset + 1);
  260. // Return the <hash-token>.
  261. } else {
  262. // Otherwise, return a <delim-token> with its value set to the current input code point.
  263. type = TYPE.Delim;
  264. offset++;
  265. }
  266. break;
  267. // U+0027 APOSTROPHE (')
  268. case 0x0027:
  269. // Consume a string token and return it.
  270. consumeStringToken();
  271. break;
  272. // U+0028 LEFT PARENTHESIS (()
  273. case 0x0028:
  274. // Return a <(-token>.
  275. type = TYPE.LeftParenthesis;
  276. offset++;
  277. break;
  278. // U+0029 RIGHT PARENTHESIS ())
  279. case 0x0029:
  280. // Return a <)-token>.
  281. type = TYPE.RightParenthesis;
  282. offset++;
  283. break;
  284. // U+002B PLUS SIGN (+)
  285. case 0x002B:
  286. // If the input stream starts with a number, ...
  287. if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
  288. // ... reconsume the current input code point, consume a numeric token, and return it.
  289. consumeNumericToken();
  290. } else {
  291. // Otherwise, return a <delim-token> with its value set to the current input code point.
  292. type = TYPE.Delim;
  293. offset++;
  294. }
  295. break;
  296. // U+002C COMMA (,)
  297. case 0x002C:
  298. // Return a <comma-token>.
  299. type = TYPE.Comma;
  300. offset++;
  301. break;
  302. // U+002D HYPHEN-MINUS (-)
  303. case 0x002D:
  304. // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
  305. if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
  306. consumeNumericToken();
  307. } else {
  308. // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
  309. if (getCharCode(offset + 1) === HYPHENMINUS &&
  310. getCharCode(offset + 2) === GREATERTHANSIGN) {
  311. type = TYPE.CDC;
  312. offset = offset + 3;
  313. } else {
  314. // Otherwise, if the input stream starts with an identifier, ...
  315. if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
  316. // ... reconsume the current input code point, consume an ident-like token, and return it.
  317. consumeIdentLikeToken();
  318. } else {
  319. // Otherwise, return a <delim-token> with its value set to the current input code point.
  320. type = TYPE.Delim;
  321. offset++;
  322. }
  323. }
  324. }
  325. break;
  326. // U+002E FULL STOP (.)
  327. case 0x002E:
  328. // If the input stream starts with a number, ...
  329. if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
  330. // ... reconsume the current input code point, consume a numeric token, and return it.
  331. consumeNumericToken();
  332. } else {
  333. // Otherwise, return a <delim-token> with its value set to the current input code point.
  334. type = TYPE.Delim;
  335. offset++;
  336. }
  337. break;
  338. // U+002F SOLIDUS (/)
  339. case 0x002F:
  340. // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A ASTERISK (*),
  341. if (getCharCode(offset + 1) === ASTERISK) {
  342. // ... consume them and all following code points up to and including the first U+002A ASTERISK (*)
  343. // followed by a U+002F SOLIDUS (/), or up to an EOF code point.
  344. type = TYPE.Comment;
  345. offset = source.indexOf('*/', offset + 2) + 2;
  346. if (offset === 1) {
  347. offset = source.length;
  348. }
  349. } else {
  350. type = TYPE.Delim;
  351. offset++;
  352. }
  353. break;
  354. // U+003A COLON (:)
  355. case 0x003A:
  356. // Return a <colon-token>.
  357. type = TYPE.Colon;
  358. offset++;
  359. break;
  360. // U+003B SEMICOLON (;)
  361. case 0x003B:
  362. // Return a <semicolon-token>.
  363. type = TYPE.Semicolon;
  364. offset++;
  365. break;
  366. // U+003C LESS-THAN SIGN (<)
  367. case 0x003C:
  368. // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
  369. if (getCharCode(offset + 1) === EXCLAMATIONMARK &&
  370. getCharCode(offset + 2) === HYPHENMINUS &&
  371. getCharCode(offset + 3) === HYPHENMINUS) {
  372. // ... consume them and return a <CDO-token>.
  373. type = TYPE.CDO;
  374. offset = offset + 4;
  375. } else {
  376. // Otherwise, return a <delim-token> with its value set to the current input code point.
  377. type = TYPE.Delim;
  378. offset++;
  379. }
  380. break;
  381. // U+0040 COMMERCIAL AT (@)
  382. case 0x0040:
  383. // If the next 3 input code points would start an identifier, ...
  384. if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
  385. // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.
  386. type = TYPE.AtKeyword;
  387. offset = consumeName(source, offset + 1);
  388. } else {
  389. // Otherwise, return a <delim-token> with its value set to the current input code point.
  390. type = TYPE.Delim;
  391. offset++;
  392. }
  393. break;
  394. // U+005B LEFT SQUARE BRACKET ([)
  395. case 0x005B:
  396. // Return a <[-token>.
  397. type = TYPE.LeftSquareBracket;
  398. offset++;
  399. break;
  400. // U+005C REVERSE SOLIDUS (\)
  401. case 0x005C:
  402. // If the input stream starts with a valid escape, ...
  403. if (isValidEscape(code, getCharCode(offset + 1))) {
  404. // ... reconsume the current input code point, consume an ident-like token, and return it.
  405. consumeIdentLikeToken();
  406. } else {
  407. // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
  408. type = TYPE.Delim;
  409. offset++;
  410. }
  411. break;
  412. // U+005D RIGHT SQUARE BRACKET (])
  413. case 0x005D:
  414. // Return a <]-token>.
  415. type = TYPE.RightSquareBracket;
  416. offset++;
  417. break;
  418. // U+007B LEFT CURLY BRACKET ({)
  419. case 0x007B:
  420. // Return a <{-token>.
  421. type = TYPE.LeftCurlyBracket;
  422. offset++;
  423. break;
  424. // U+007D RIGHT CURLY BRACKET (})
  425. case 0x007D:
  426. // Return a <}-token>.
  427. type = TYPE.RightCurlyBracket;
  428. offset++;
  429. break;
  430. // digit
  431. case INPUT_STREAM_CODE_TYPE.Digit:
  432. // Reconsume the current input code point, consume a numeric token, and return it.
  433. consumeNumericToken();
  434. break;
  435. // name-start code point
  436. case INPUT_STREAM_CODE_TYPE.NameStart:
  437. // Reconsume the current input code point, consume an ident-like token, and return it.
  438. consumeIdentLikeToken();
  439. break;
  440. // EOF
  441. case INPUT_STREAM_CODE_TYPE.Eof:
  442. // Return an <EOF-token>.
  443. break;
  444. // anything else
  445. default:
  446. // Return a <delim-token> with its value set to the current input code point.
  447. type = TYPE.Delim;
  448. offset++;
  449. }
  450. switch (type) {
  451. case balanceCloseType:
  452. balancePrev = balanceStart & OFFSET_MASK;
  453. balanceStart = balance[balancePrev];
  454. balanceCloseType = balanceStart >> TYPE_SHIFT;
  455. balance[tokenCount] = balancePrev;
  456. balance[balancePrev++] = tokenCount;
  457. for (; balancePrev < tokenCount; balancePrev++) {
  458. if (balance[balancePrev] === sourceLength) {
  459. balance[balancePrev] = tokenCount;
  460. }
  461. }
  462. break;
  463. case TYPE.LeftParenthesis:
  464. case TYPE.Function:
  465. balance[tokenCount] = balanceStart;
  466. balanceCloseType = TYPE.RightParenthesis;
  467. balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
  468. break;
  469. case TYPE.LeftSquareBracket:
  470. balance[tokenCount] = balanceStart;
  471. balanceCloseType = TYPE.RightSquareBracket;
  472. balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
  473. break;
  474. case TYPE.LeftCurlyBracket:
  475. balance[tokenCount] = balanceStart;
  476. balanceCloseType = TYPE.RightCurlyBracket;
  477. balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
  478. break;
  479. }
  480. offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
  481. }
  482. // finalize buffers
  483. offsetAndType[tokenCount] = offset;
  484. balance[tokenCount] = sourceLength;
  485. balance[sourceLength] = sourceLength; // prevents false positive balance match with any token
  486. while (balanceStart !== 0) {
  487. balancePrev = balanceStart & OFFSET_MASK;
  488. balanceStart = balance[balancePrev];
  489. balance[balancePrev] = sourceLength;
  490. }
  491. // update stream
  492. stream.source = source;
  493. stream.firstCharOffset = start;
  494. stream.offsetAndType = offsetAndType;
  495. stream.tokenCount = tokenCount;
  496. stream.balance = balance;
  497. stream.reset();
  498. stream.next();
  499. return stream;
  500. }
  501. //
  502. // tokenizer
  503. //
  504. // extend tokenizer with constants
  /**
   * Copy every own enumerable string-keyed property of `statics` onto the
   * `tokenize` function, so the values are reachable as `tokenize.<key>`.
   */
  function exposeStatics(statics) {
      var keys = Object.keys(statics);

      for (var i = 0; i < keys.length; i++) {
          tokenize[keys[i]] = statics[keys[i]];
      }
  }

  // Constants first, then the utils helpers; on a name clash the utils value wins.
  exposeStatics(constants);
  exposeStatics(utils);

  module.exports = tokenize;