SaxParser.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. /**
  2. * SaxParser.js
  3. *
  4. * Copyright, Moxiecode Systems AB
  5. * Released under LGPL License.
  6. *
  7. * License: http://www.tinymce.com/license
  8. * Contributing: http://www.tinymce.com/contributing
  9. */
  10. /*eslint max-depth:[2, 9] */
  11. /**
  12. * This class parses HTML code using pure JavaScript and executes various events for each item it finds. It will
  13. * always execute the events in the right order for tag soup code like <b><p></b></p>. It will also remove elements
  14. * and attributes that don't fit the schema if the validate setting is enabled.
  15. *
  16. * @example
  17. * var parser = new tinymce.html.SaxParser({
  18. * validate: true,
  19. *
  20. * comment: function(text) {
  21. * console.log('Comment:', text);
  22. * },
  23. *
  24. * cdata: function(text) {
  25. * console.log('CDATA:', text);
  26. * },
  27. *
  28. * text: function(text, raw) {
  29. * console.log('Text:', text, 'Raw:', raw);
  30. * },
  31. *
  32. * start: function(name, attrs, empty) {
  33. * console.log('Start:', name, attrs, empty);
  34. * },
  35. *
  36. * end: function(name) {
  37. * console.log('End:', name);
  38. * },
  39. *
  40. * pi: function(name, text) {
  41. * console.log('PI:', name, text);
  42. * },
  43. *
  44. * doctype: function(text) {
  45. * console.log('DocType:', text);
  46. * }
  47. * }, schema);
  48. * @class tinymce.html.SaxParser
  49. * @version 3.4
  50. */
  define("tinymce/html/SaxParser", [
    "tinymce/html/Schema",
    "tinymce/html/Entities",
    "tinymce/util/Tools"
  ], function(Schema, Entities, Tools) {
    var each = Tools.each;
    var objectHasOwn = Object.prototype.hasOwnProperty;

    /**
     * Returns true when the lookup table has the key as an own property.
     *
     * The `in` operator and plain `map[key]` also see names inherited from
     * Object.prototype (for example "constructor"), and tag/attribute names come
     * straight from the parsed markup, so every table lookup goes through here.
     *
     * @private
     * @param {Object} map Lookup table, may be undefined.
     * @param {String} key Key to look for.
     * @return {Boolean} True if the key is an own property of the map.
     */
    function hasOwn(map, key) {
      return !!map && objectHasOwn.call(map, key);
    }

    /**
     * Returns the own property value for the key or undefined when the table
     * doesn't have it as an own property.
     *
     * @private
     * @param {Object} map Lookup table, may be undefined.
     * @param {String} key Key to look up.
     * @return {Object} The stored value or undefined.
     */
    function lookup(map, key) {
      return hasOwn(map, key) ? map[key] : undefined;
    }

    /**
     * Constructs a new SaxParser instance.
     *
     * Note that the passed in settings object is modified: fix_self_closing is set
     * to true on it unless it was explicitly set to false.
     *
     * @constructor
     * @method SaxParser
     * @param {Object} settings Name/value collection of settings. comment, cdata, text, start, end, pi and
     * doctype are callbacks.
     * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing.
     */
    return function(settings, schema) {
      var self = this;

      function noop() {}

      settings = settings || {};
      self.schema = schema = schema || new Schema();

      if (settings.fix_self_closing !== false) {
        settings.fix_self_closing = true;
      }

      // Add handler functions from settings and setup default handlers
      each('comment cdata text start end pi doctype'.split(' '), function(name) {
        if (name) {
          self[name] = settings[name] || noop;
        }
      });

      /**
       * Parses the specified HTML string and executes the callbacks for each item it finds.
       *
       * The callbacks are invoked on the object that parse is called on, elements left open
       * at the end of the input get their end callback executed in reverse order.
       *
       * @example
       * new SaxParser({...}).parse('<b>text</b>');
       * @method parse
       * @param {String} html Html string to sax parse.
       */
      self.parse = function(html) {
        var self = this, matches, index = 0, value, endRegExp, stack = [], attrList, i, text;
        var isInternalElement, removeInternalElements, shortEndedElements, fillAttrsMap, isShortEnded;
        var validate, elementRule, isValidElement, attr, attribsValue, validAttributesMap, validAttributePatterns;
        var attributesRequired, attributesDefault, attributesForced;
        var anyAttributesRequired, selfClosing, tokenRegExp, attrRegExp, specialElements, idCount = 0;
        var decode = Entities.decode, fixSelfClosing, filteredUrlAttrs = Tools.makeMap('src,href,data,background,formaction,poster');
        var scriptUriRegExp = /((java|vb)script|mhtml):/i, dataUriRegExp = /^data:/i;

        /**
         * Closes the nearest open element with the specified name together with all
         * elements opened after it. Does nothing if no such element is open.
         *
         * @private
         * @param {String} name Lower case name of the element to close.
         */
        function processEndTag(name) {
          var pos = stack.length, i, entry;

          // Find position of parent of the same type
          while (pos--) {
            if (stack[pos].name === name) {
              break;
            }
          }

          // Found parent
          if (pos >= 0) {
            // Close all the open elements, innermost first; invalid ones were never started
            for (i = stack.length - 1; i >= pos; i--) {
              entry = stack[i];

              if (entry.valid) {
                self.end(entry.name);
              }
            }

            // Remove the open elements from the stack
            stack.length = pos;
          }
        }

        /**
         * Adds a schema supplied (forced/default) attribute to the current attribute
         * list, the {$uid} placeholder value is replaced by a generated id.
         *
         * @private
         * @param {String} attrName Name of the attribute to add.
         * @param {String} attrValue Value from the schema rule.
         */
        function addSchemaAttribute(attrName, attrValue) {
          if (attrValue === '{$uid}') {
            attrValue = 'mce_' + idCount++;
          }

          attrList.map[attrName] = attrValue;
          attrList.push({name: attrName, value: attrValue});
        }

        /**
         * Replace callback for attrRegExp. Validates a single attribute of the current
         * start tag and adds it to attrList if it is allowed.
         *
         * @private
         * @param {String} match Whole attribute match, not used.
         * @param {String} name Attribute name.
         * @param {String} value Double quoted attribute value if any.
         * @param {String} val2 Single quoted attribute value if any.
         * @param {String} val3 Unquoted attribute value if any.
         */
        function parseAttribute(match, name, value, val2, val3) {
          var attrRule, i, uri, trimRegExp = /[\s\u0000-\u001F]+/g;

          name = name.toLowerCase();

          // Boolean attributes get their own name as value, other values are entity decoded
          value = hasOwn(fillAttrsMap, name) ? name : decode(value || val2 || val3 || '');

          // Validate name and value pass through all data- attributes
          if (validate && !isInternalElement && name.indexOf('data-') !== 0) {
            attrRule = lookup(validAttributesMap, name);

            // Find rule by pattern matching
            if (!attrRule && validAttributePatterns) {
              i = validAttributePatterns.length;
              while (i--) {
                attrRule = validAttributePatterns[i];
                if (attrRule.pattern.test(name)) {
                  break;
                }
              }

              // No rule matched
              if (i === -1) {
                attrRule = null;
              }
            }

            // No attribute rule found
            if (!attrRule) {
              return;
            }

            // Validate value
            if (attrRule.validValues && !hasOwn(attrRule.validValues, value)) {
              return;
            }
          }

          // Block any javascript: urls or non image data uris
          if (hasOwn(filteredUrlAttrs, name) && !settings.allow_script_urls) {
            // Whitespace and control characters are removed before the scheme is checked
            uri = value.replace(trimRegExp, '');

            try {
              // Might throw malformed URI sequence
              uri = decodeURIComponent(uri);
            } catch (ex) {
              // Fallback to non UTF-8 decoder
              uri = unescape(uri);
            }

            if (scriptUriRegExp.test(uri)) {
              return;
            }

            if (!settings.allow_html_data_urls && dataUriRegExp.test(uri) && !/^data:image\//i.test(uri)) {
              return;
            }
          }

          // Add attribute to list and map
          attrList.map[name] = value;
          attrList.push({
            name: name,
            value: value
          });
        }

        // Precompile RegExps and map objects
        tokenRegExp = new RegExp('<(?:' +
          '(?:!--([\\w\\W]*?)-->)|' + // Comment
          '(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)|' + // CDATA
          '(?:!DOCTYPE([\\w\\W]*?)>)|' + // DOCTYPE
          '(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)|' + // PI
          '(?:\\/([^>]+)>)|' + // End element
          '(?:([A-Za-z0-9\\-\\:\\.]+)((?:\\s+[^"\'>]+(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>]*))*|\\/|\\s+)>)' + // Start element
          ')', 'g');

        attrRegExp = /([\w:\-]+)(?:\s*=\s*(?:(?:\"((?:[^\"])*)\")|(?:\'((?:[^\'])*)\')|([^>\s]+)))?/g;

        // Setup lookup tables for empty elements and boolean attributes
        shortEndedElements = schema.getShortEndedElements();
        selfClosing = settings.self_closing_elements || schema.getSelfClosingElements();
        fillAttrsMap = schema.getBoolAttrs();
        validate = settings.validate;
        removeInternalElements = settings.remove_internals;
        fixSelfClosing = settings.fix_self_closing;
        specialElements = schema.getSpecialElements();

        while ((matches = tokenRegExp.exec(html))) {
          // Text
          if (index < matches.index) {
            self.text(decode(html.substring(index, matches.index)));
          }

          if ((value = matches[6])) { // End element
            value = value.toLowerCase();

            // IE will add a ":" in front of elements it doesn't understand like custom elements or HTML5 elements
            if (value.charAt(0) === ':') {
              value = value.substring(1);
            }

            processEndTag(value);
          } else if ((value = matches[7])) { // Start element
            value = value.toLowerCase();

            // IE will add a ":" in front of elements it doesn't understand like custom elements or HTML5 elements
            if (value.charAt(0) === ':') {
              value = value.substring(1);
            }

            isShortEnded = hasOwn(shortEndedElements, value);

            // Is self closing tag for example an <li> after an open <li>
            if (fixSelfClosing && lookup(selfClosing, value) && stack.length > 0 && stack[stack.length - 1].name === value) {
              processEndTag(value);
            }

            // Reset the per tag state so nothing leaks in from the previously parsed start tag
            attribsValue = matches[8];
            isInternalElement = false;
            attrList = [];
            attrList.map = {};

            // Validate element
            if (!validate || (elementRule = schema.getElementRule(value))) {
              isValidElement = true;

              // Grab attributes map and patterns when validation is enabled
              if (validate) {
                validAttributesMap = elementRule.attributes;
                validAttributePatterns = elementRule.attributePatterns;
              }

              // Parse attributes
              if (attribsValue) {
                isInternalElement = attribsValue.indexOf('data-mce-type') !== -1; // Check if the element is an internal element

                // If the element has internal attributes then remove it if we are told to do so
                if (isInternalElement && removeInternalElements) {
                  isValidElement = false;
                }

                attribsValue.replace(attrRegExp, parseAttribute);
              }

              // Process attributes if validation is enabled
              if (validate && !isInternalElement) {
                attributesRequired = elementRule.attributesRequired;
                attributesDefault = elementRule.attributesDefault;
                attributesForced = elementRule.attributesForced;
                anyAttributesRequired = elementRule.removeEmptyAttrs;

                // Check if any attribute exists
                if (anyAttributesRequired && !attrList.length) {
                  isValidElement = false;
                }

                // Handle forced attributes
                if (attributesForced) {
                  i = attributesForced.length;
                  while (i--) {
                    attr = attributesForced[i];
                    addSchemaAttribute(attr.name, attr.value);
                  }
                }

                // Handle default attributes, only added when not present on the element
                if (attributesDefault) {
                  i = attributesDefault.length;
                  while (i--) {
                    attr = attributesDefault[i];

                    if (!hasOwn(attrList.map, attr.name)) {
                      addSchemaAttribute(attr.name, attr.value);
                    }
                  }
                }

                // Handle required attributes, at least one of them has to be present
                if (attributesRequired) {
                  i = attributesRequired.length;
                  while (i--) {
                    if (hasOwn(attrList.map, attributesRequired[i])) {
                      break;
                    }
                  }

                  // None of the required attributes were found
                  if (i === -1) {
                    isValidElement = false;
                  }
                }

                // Invalidate element if it's marked as bogus
                if (attrList.map['data-mce-bogus']) {
                  isValidElement = false;
                }
              }

              if (isValidElement) {
                self.start(value, attrList, isShortEnded);
              }
            } else {
              isValidElement = false;
            }

            // Treat script, noscript and style a bit different since they may include code that looks like elements
            if ((endRegExp = lookup(specialElements, value))) {
              // Scan for the end tag starting right after the start tag
              endRegExp.lastIndex = index = matches.index + matches[0].length;

              if ((matches = endRegExp.exec(html))) {
                if (isValidElement) {
                  text = html.substring(index, matches.index);
                }

                index = matches.index + matches[0].length;
              } else {
                // No end tag, the rest of the input is the contents of the element
                text = html.substring(index);
                index = html.length;
              }

              if (isValidElement) {
                if (text.length > 0) {
                  self.text(text, true);
                }

                self.end(value);
              }

              // Continue tokenizing after the end tag
              tokenRegExp.lastIndex = index;
              continue;
            }

            // Push value on to stack
            if (!isShortEnded) {
              // A tag is treated as closed when the first slash in its attribute string is the last character
              if (!attribsValue || attribsValue.indexOf('/') !== attribsValue.length - 1) {
                stack.push({name: value, valid: isValidElement});
              } else if (isValidElement) {
                self.end(value);
              }
            }
          } else if ((value = matches[1])) { // Comment
            // Pad comment value to avoid browsers from parsing invalid comments as HTML
            if (value.charAt(0) === '>') {
              value = ' ' + value;
            }

            if (!settings.allow_conditional_comments && value.substring(0, 3) === '[if') {
              value = ' ' + value;
            }

            self.comment(value);
          } else if ((value = matches[2])) { // CDATA
            self.cdata(value);
          } else if ((value = matches[3])) { // DOCTYPE
            self.doctype(value);
          } else if ((value = matches[4])) { // PI
            self.pi(value, matches[5]);
          }

          index = matches.index + matches[0].length;
        }

        // Text
        if (index < html.length) {
          self.text(decode(html.substring(index)));
        }

        // Close any open elements
        for (i = stack.length - 1; i >= 0; i--) {
          value = stack[i];

          if (value.valid) {
            self.end(value.name);
          }
        }
      };
    };
  });