1 /**
  2  * clean html pasted from word. modified from ckeditor.
  3  * @author yiminghe@gmail.com
  4  */
  5 KISSY.add("editor/core/dynamic/wordFilter", function (S, KEStyle, HtmlParser) {
  6     var $ = S.all,
  7         UA = S.UA,
  8         dtd = HtmlParser.DTD,
  9         wordFilter = new HtmlParser.Filter(),
 10         cssLengthRelativeUnit = /^([.\d]*)+(em|ex|px|gd|rem|vw|vh|vm|ch|mm|cm|in|pt|pc|deg|rad|ms|s|hz|khz){1}?/i,
 11     // e.g. 0px 0pt 0px
 12         emptyMarginRegex = /^(?:\b0[^\s]*\s*){1,4}$/,
 13         romanLiteralPattern = '^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$',
 14         lowerRomanLiteralRegex = new RegExp(romanLiteralPattern),
 15         upperRomanLiteralRegex = new RegExp(romanLiteralPattern.toUpperCase()),
 16         orderedPatterns = {
 17             'decimal':/\d+/,
 18             'lower-roman':lowerRomanLiteralRegex,
 19             'upper-roman':upperRomanLiteralRegex,
 20             'lower-alpha':/^[a-z]+$/,
 21             'upper-alpha':/^[A-Z]+$/
 22         },
 23         unorderedPatterns = {
 24             'disc':/[l\u00B7\u2002]/,
 25             'circle':/[\u006F\u00D8]/,
 26             'square':/[\u006E\u25C6]/
 27         },
 28         listMarkerPatterns = {
 29             'ol':orderedPatterns,
 30             'ul':unorderedPatterns
 31         },
 32         romans = [
 33             [1000, 'M'],
 34             [900, 'CM'],
 35             [500, 'D'],
 36             [400, 'CD'],
 37             [100, 'C'],
 38             [90, 'XC'],
 39             [50, 'L'],
 40             [40, 'XL'],
 41             [10, 'X'],
 42             [9, 'IX'],
 43             [5, 'V'],
 44             [4, 'IV'],
 45             [1, 'I']
 46         ],
 47         alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 48 
 49     // Convert roman numbering back to decimal.
 50     function fromRoman(str) {
 51         str = str.toUpperCase();
 52         var l = romans.length, retVal = 0;
 53         for (var i = 0; i < l; ++i) {
 54             for (var j = romans[i], k = j[1].length; str.substr(0, k) == j[1]; str = str.substr(k))
 55                 retVal += j[ 0 ];
 56         }
 57         return retVal;
 58     }
 59 
 60     // Convert alphabet numbering back to decimal.
 61     function fromAlphabet(str) {
 62         str = str.toUpperCase();
 63         var l = alphabets.length, retVal = 1;
 64         for (var x = 1; str.length > 0; x *= l) {
 65             retVal += alphabets.indexOf(str.charAt(str.length - 1)) * x;
 66             str = str.substr(0, str.length - 1);
 67         }
 68         return retVal;
 69     }
 70 
 71     function setStyle(element, str) {
 72         if (str) {
 73             element.setAttribute("style", str);
 74         } else {
 75             element.removeAttribute("style");
 76         }
 77     }
 78 
 79     /**
 80      * Convert the specified CSS length value to the calculated pixel length inside this page.
 81      * <strong>Note:</strong> Percentage based value is left intact.
 82      * @param {String} cssLength CSS length value.
 83      */
 84     var convertToPx = (function () {
 85         var calculator;
 86 
 87         return function (cssLength) {
 88             if (!calculator) {
 89                 calculator = $(
 90                     '<div style="position:absolute;left:-9999px;' +
 91                         'top:-9999px;margin:0px;padding:0px;border:0px;"' +
 92                         '></div>')['prependTo']("body");
 93 
 94             }
 95 
 96             if (!(/%$/).test(cssLength)) {
 97                 calculator.css('width', cssLength);
 98                 return calculator[0].clientWidth;
 99             }
100 
101             return cssLength;
102         };
103     })();
104 
105     var listBaseIndent = 0,
106         previousListItemMargin = null,
107         previousListId;
108 
109     function onlyChild(elem) {
110         var childNodes = elem.childNodes || [],
111             count = childNodes.length,
112             firstChild = (count == 1) && childNodes[0];
113         return firstChild || null;
114     }
115 
116     function removeAnyChildWithName(elem, tagName) {
117         var children = elem.childNodes || [],
118             ret = [],
119             child;
120 
121         for (var i = 0; i < children.length; i++) {
122             child = children[ i ];
123             if (!child.nodeName) {
124                 continue;
125             }
126             if (child.nodeName == tagName) {
127                 ret.push(child);
128                 children.splice(i--, 1);
129             }
130             ret = ret.concat(removeAnyChildWithName(child, tagName));
131         }
132         return ret;
133     }
134 
135     function getAncestor(elem, tagNameRegex) {
136         var parent = elem.parentNode;
137         while (parent && !( parent.nodeName && parent.nodeName.match(tagNameRegex) )) {
138             parent = parent.parentNode;
139         }
140         return parent;
141     }
142 
143     function firstChild(elem, evaluator) {
144         var child,
145             i,
146             children = elem.childNodes || [];
147 
148         for (i = 0; i < children.length; i++) {
149             child = children[ i ];
150             if (evaluator(child)) {
151                 return child;
152             } else if (child.nodeName) {
153                 child = firstChild(child, evaluator);
154                 if (child) {
155                     return child;
156                 }
157             }
158         }
159 
160         return null;
161     }
162 
163 
164     function addStyle(elem, name, value, isPrepend) {
165         var styleText, addingStyleText = '';
166         // name/value pair.
167         if (typeof value == 'string') {
168             addingStyleText += name + ':' + value + ';';
169         } else {
170             // style literal.
171             if (typeof name == 'object') {
172                 for (var style in name) {
173                     if (name.hasOwnProperty(style)) {
174                         addingStyleText += style + ':' + name[ style ] + ';';
175                     }
176                 }
177             }
178             // raw style text form.
179             else {
180                 addingStyleText += name;
181             }
182             isPrepend = value;
183         }
184 
185 
186         styleText = elem.getAttribute("style");
187 
188         styleText = ( isPrepend ?
189             [ addingStyleText, styleText ]
190             : [ styleText, addingStyleText ] ).join(';');
191 
192         setStyle(elem, styleText.replace(/^;|;(?=;)/, ''));
193     }
194 
195 
196     function parentOf(tagName) {
197         var result = {},
198             tag;
199         for (tag in dtd) {
200             if (tag.indexOf('$') == -1 && dtd[ tag ][ tagName ]) {
201                 result[ tag ] = 1;
202             }
203         }
204         return result;
205     }
206 
207     var filters =
208     {
209         // Transform a normal list into flat list items only presentation.
210         // E.g. <ul><li>level1<ol><li>level2</li></ol></li> =>
211         // <ke:li ke:listtype="ul" ke:indent="1">level1</ke:li>
212         // <ke:li ke:listtype="ol" ke:indent="2">level2</ke:li>
213         flattenList:function (element, level) {
214             level = typeof level == 'number' ? level : 1;
215 
216             var listStyleType;
217 
218             // All list items are of the same type.
219             switch (element.getAttribute("type")) {
220                 case 'a' :
221                     listStyleType = 'lower-alpha';
222                     break;
223                 case '1' :
224                     listStyleType = 'decimal';
225                     break;
226                 // TODO: Support more list style type from MS-Word.
227             }
228 
229             var children = element.childNodes || [],
230                 child;
231 
232             for (var i = 0; i < children.length; i++) {
233                 child = children[ i ];
234 
235                 if (child.nodeName in dtd.$listItem) {
236                     var listItemChildren = child.childNodes || [],
237                         count = listItemChildren.length,
238                         last = listItemChildren[ count - 1 ];
239 
240                     // Move out nested list.
241                     if (last.nodeName in dtd.$list) {
242                         element.insertAfter(child);
243                         // Remove the parent list item if it's just a holder.
244                         if (!--listItemChildren.length) {
245                             element.removeChild(children[i--]);
246                         }
247                     }
248 
249                     child.setTagName('ke:li');
250 
251                     // Inherit numbering from list root on the first list item.
252                     element.getAttribute("start") &&
253                         !i &&
254                     ( element.setAttribute("value", element.getAttribute("start")));
255 
256                     filters.stylesFilter(
257                         [
258                             [
259                                 'tab-stops', null, function (val) {
260                                 var margin = val.split(' ')[ 1 ].match(cssLengthRelativeUnit);
261                                 margin && ( previousListItemMargin = convertToPx(margin[ 0 ]) );
262                             }
263                             ],
264                             ( level == 1 ? [ 'mso-list', null, function (val) {
265                                 val = val.split(' ');
266                                 var listId = Number(val[ 0 ].match(/\d+/));
267                                 if (listId !== previousListId) {
268                                     child.setAttribute('ke:reset', 1)
269                                 }
270                                 previousListId = listId;
271                             } ] : null )
272                         ])(child.getAttribute("style"));
273 
274                     child.setAttribute('ke:indent', level);
275                     child.setAttribute('ke:listtype', element.nodeName);
276                     child.setAttribute('ke:list-style-type', listStyleType);
277                 }
278                 // Flatten sub list.
279                 else if (child.nodeName in dtd.$list) {
280                     // Absorb sub list children.
281                     arguments.callee.apply(this, [ child, level + 1 ]);
282                     children = children.slice(0, i).concat(child.childNodes).concat(children.slice(i + 1));
283                     element.empty();
284                     for (var j = 0, num = children.length; j < num; j++)
285                         element.appendChild(children[j]);
286                 }
287             }
288 
289             element.nodeName = element.tagName = null;
290 
291             // We're loosing tag name here, signalize this element as a list.
292             element.setAttribute('ke:list', 1);
293         },
294 
295         /**
296          *  Try to collect all list items among the children and establish one
297          *  or more HTML list structures for them.
298          * @param element
299          */
300         assembleList:function (element) {
301             var children = element.childNodes || [],
302                 child,
303                 listItem, // The current processing ke:li element.
304                 listItemIndent, // Indent level of current list item.
305                 lastIndent,
306                 lastListItem, // The previous one just been added to the list.
307                 list, // Current staging list and it's parent list if any.
308                 openedLists = [],
309                 previousListStyleType,
310                 previousListType;
311 
312             // Properties of the list item are to be resolved from the list bullet.
313             var bullet,
314                 listType,
315                 listStyleType,
316                 itemNumeric;
317 
318             for (var i = 0; i < children.length; i++) {
319                 child = children[ i ];
320 
321                 if ('ke:li' == child.nodeName) {
322                     child.setTagName('li');
323                     listItem = child;
324 
325                     bullet = listItem.getAttribute('ke:listsymbol');
326                     bullet = bullet && bullet.match(/^(?:[(]?)([^\s]+?)([.)]?)$/);
327                     listType = listStyleType = itemNumeric = null;
328 
329                     if (listItem.getAttribute('ke:ignored')) {
330                         children.splice(i--, 1);
331                         continue;
332                     }
333 
334 
335                     // This's from a new list root.
336                     listItem.getAttribute('ke:reset') && ( list = lastIndent = lastListItem = null );
337 
338                     // List item indent level might come from a real list indentation or
339                     // been resolved from a pseudo list item's margin value, even get
340                     // no indentation at all.
341                     listItemIndent = Number(listItem.getAttribute('ke:indent'));
342 
343                     // We're moving out of the current list, cleaning up.
344                     if (listItemIndent != lastIndent)
345                         previousListType = previousListStyleType = null;
346 
347                     // List type and item style are already resolved.
348                     if (!bullet) {
349                         listType = listItem.getAttribute('ke:listtype') || 'ol';
350                         listStyleType = listItem.getAttribute('ke:list-style-type');
351                     }
352                     else {
353                         // Probably share the same list style type with previous list item,
354                         // give it priority to avoid ambiguous between C(Alpha) and C.(Roman).
355                         if (previousListType &&
356                             listMarkerPatterns[ previousListType ] [ previousListStyleType ].test(bullet[ 1 ])) {
357                             listType = previousListType;
358                             listStyleType = previousListStyleType;
359                         }
360                         else {
361                             for (var type in listMarkerPatterns) {
362                                 for (var style in listMarkerPatterns[ type ]) {
363                                     if (listMarkerPatterns[ type ][ style ].test(bullet[ 1 ])) {
364                                         // Small numbering has higher priority, when dealing with ambiguous
365                                         // between C(Alpha) and C.(Roman).
366                                         if (type == 'ol' && ( /alpha|roman/ ).test(style)) {
367                                             var num = /roman/.test(style) ? fromRoman(bullet[ 1 ]) : fromAlphabet(bullet[ 1 ]);
368                                             if (!itemNumeric || num < itemNumeric) {
369                                                 itemNumeric = num;
370                                                 listType = type;
371                                                 listStyleType = style;
372                                             }
373                                         }
374                                         else {
375                                             listType = type;
376                                             listStyleType = style;
377                                             break;
378                                         }
379                                     }
380                                 }
381                             }
382                         }
383 
384                         // Simply use decimal/disc for the rest forms of unrepresentable
385                         // numerals, e.g. Chinese..., but as long as there a second part
386                         // included, it has a bigger chance of being a order list ;)
387                         !listType && ( listType = bullet[ 2 ] ? 'ol' : 'ul' );
388                     }
389 
390                     previousListType = listType;
391                     previousListStyleType = listStyleType || ( listType == 'ol' ? 'decimal' : 'disc' );
392                     if (listStyleType && listStyleType != ( listType == 'ol' ? 'decimal' : 'disc' ))
393                         addStyle(listItem, 'list-style-type', listStyleType);
394 
395                     // Figure out start numbering.
396                     if (listType == 'ol' && bullet) {
397                         switch (listStyleType) {
398                             case 'decimal' :
399                                 itemNumeric = Number(bullet[ 1 ]);
400                                 break;
401                             case 'lower-roman':
402                             case 'upper-roman':
403                                 itemNumeric = fromRoman(bullet[ 1 ]);
404                                 break;
405                             case 'lower-alpha':
406                             case 'upper-alpha':
407                                 itemNumeric = fromAlphabet(bullet[ 1 ]);
408                                 break;
409                         }
410 
411                         // Always create the numbering, swipe out unnecessary ones later.
412                         listItem.setAttribute("value", itemNumeric);
413                     }
414 
415                     // Start the list construction.
416                     if (!list) {
417                         openedLists.push(list = new HtmlParser.Tag(listType));
418                         list.appendChild(listItem);
419                         element.replaceChild(list, children[i]);
420                     } else {
421                         if (listItemIndent > lastIndent) {
422                             openedLists.push(list = new HtmlParser.Tag(listType));
423                             list.appendChild(listItem);
424                             lastListItem.appendChild(list);
425                         }
426                         else if (listItemIndent < lastIndent) {
427                             // There might be a negative gap between two list levels. (#4944)
428                             var diff = lastIndent - listItemIndent,
429                                 parent;
430                             while (diff-- && ( parent = list.parentNode )) {
431                                 list = parent.parentNode;
432                             }
433                             list.appendChild(listItem);
434                         }
435                         else {
436                             list.appendChild(listItem);
437                         }
438                         children.splice(i--, 1);
439                     }
440 
441                     lastListItem = listItem;
442                     lastIndent = listItemIndent;
443                 }
444                 else if (child.nodeType == 3 && !S.trim(child.nodeValue)) {
445                     //  li 间的空文字节点忽略
446                 } else if (list) {
447                     list = lastIndent = lastListItem = null;
448                 }
449             }
450 
451             for (i = 0; i < openedLists.length; i++) {
452                 postProcessList(openedLists[ i ]);
453             }
454         },
455 
456         /**
457          * A simple filter which always rejecting.
458          */
459         falsyFilter:function () {
460             return false;
461         },
462 
463         /**
464          * A filter dedicated on the 'style' attribute filtering, e.g. dropping/replacing style properties.
465          * @param styles {Array} in form of [ styleNameRegexp, styleValueRegexp,
466          *  newStyleValue/newStyleGenerator, newStyleName ] where only the first
467          *  parameter is mandatory.
468          * @param [whitelist] {Boolean} Whether the {@param styles} will be considered as a white-list.
469          */
470         stylesFilter:function (styles, whitelist) {
471             return function (styleText, element) {
472                 var rules = [];
473                 // html-encoded quote might be introduced by 'font-family'
474                 // from MS-Word which confused the following regexp. e.g.
475                 //'font-family: "Lucida, Console"'
476                 ( styleText || '' )
477                     .replace(/"/g, '"')
478                     .replace(/\s*([^ :;]+)\s*:\s*([^;]+)\s*(?=;|$)/g,
479                     function (match, name, value) {
480                         name = name.toLowerCase();
481                         name == 'font-family' && ( value = value.replace(/["']/g, '') );
482 
483                         var namePattern,
484                             valuePattern,
485                             newValue,
486                             newName;
487                         for (var i = 0; i < styles.length; i++) {
488                             if (styles[ i ]) {
489                                 namePattern = styles[ i ][ 0 ];
490                                 valuePattern = styles[ i ][ 1 ];
491                                 newValue = styles[ i ][ 2 ];
492                                 newName = styles[ i ][ 3 ];
493 
494                                 if (name.match(namePattern)
495                                     && ( !valuePattern || value.match(valuePattern) )) {
496                                     name = newName || name;
497                                     whitelist && ( newValue = newValue || value );
498 
499                                     if (typeof newValue == 'function') {
500                                         newValue = newValue(value, element, name);
501                                     }
502 
503                                     // Return an couple indicate both name and value
504                                     // changed.
505                                     if (newValue && newValue.push) {
506                                         name = newValue[ 0 ];
507                                         newValue = newValue[ 1 ];
508                                     }
509 
510                                     if (typeof newValue == 'string') {
511                                         rules.push([ name, newValue ]);
512                                     }
513 
514                                     return;
515                                 }
516                             }
517                         }
518 
519                         !whitelist && rules.push([ name, value ]);
520 
521                     });
522 
523                 for (var i = 0; i < rules.length; i++) {
524                     rules[ i ] = rules[ i ].join(':');
525                 }
526 
527                 return rules.length ? ( rules.join(';') + ';' ) : false;
528             };
529         },
530 
531         /**
532          * A filter which will be used to apply inline css style according the stylesheet
533          * definition rules, is generated lazily when filtering.
534          */
535         applyStyleFilter:null
536 
537     };
538 
539 
540     // 1. move consistent list item styles up to list root.
541     // 2. clear out unnecessary list item numbering.
542     function postProcessList(list) {
543         var children = list.childNodes || [],
544             child,
545             count = children.length,
546             match,
547             mergeStyle,
548             styleTypeRegexp = /list-style-type:(.*?)(?:;|$)/,
549             stylesFilter = filters.stylesFilter;
550 
551 
552         if (styleTypeRegexp.exec(list.getAttribute("style")))
553             return;
554 
555         for (var i = 0; i < count; i++) {
556             child = children[ i ];
557 
558             if (child.getAttribute("value") && Number(child.getAttribute("value")) == i + 1) {
559                 child.removeAttribute("value");
560             }
561 
562             match = styleTypeRegexp.exec(child.getAttribute("style"));
563 
564             if (match) {
565                 if (match[ 1 ] == mergeStyle || !mergeStyle)
566                     mergeStyle = match[ 1 ];
567                 else {
568                     mergeStyle = null;
569                     break;
570                 }
571             }
572         }
573 
574         if (mergeStyle) {
575             for (i = 0; i < count; i++) {
576                 var style = children[ i ].getAttribute("style");
577 
578                 if (style) {
579                     style = stylesFilter([
580                         [ 'list-style-type']
581                     ])(style);
582                     setStyle(children[ i ], style);
583                 }
584             }
585             addStyle(list, 'list-style-type', mergeStyle);
586         }
587     }
588 
589     var utils = {
590         // Create a <ke:listbullet> which indicate an list item type.
591         createListBulletMarker:function (bullet, bulletText) {
592             var marker = new HtmlParser.Tag('ke:listbullet');
593             marker.setAttribute("ke:listsymbol", bullet[ 0 ]);
594             marker.appendChild(new HtmlParser.Text(bulletText));
595             return marker;
596         },
597 
598         isListBulletIndicator:function (element) {
599             var styleText = element.getAttribute("style");
600             if (/mso-list\s*:\s*Ignore/i.test(styleText)) {
601                 return true;
602             }
603         },
604 
605         isContainingOnlySpaces:function (element) {
606             var text;
607             return ( ( text = onlyChild(element) )
608                 && ( /^(:?\s| )+$/ ).test(text.nodeValue) );
609         },
610 
611         resolveList:function (element) {
612             // <ke:listbullet> indicate a list item.
613             var listMarker;
614 
615             if (( listMarker = removeAnyChildWithName(element, 'ke:listbullet') )
616                 && listMarker.length
617                 && ( listMarker = listMarker[ 0 ] )) {
618                 element.setTagName('ke:li');
619 
620                 if (element.getAttribute("style")) {
621                     var styleStr = filters.stylesFilter(
622                         [
623                             // Text-indent is not representing list item level any more.
624                             [ 'text-indent' ],
625                             [ 'line-height' ],
626                             // First attempt is to resolve indent level from on a constant margin increment.
627                             [ ( /^margin(:?-left)?$/ ), null, function (margin) {
628                                 // Deal with component/short-hand form.
629                                 var values = margin.split(' ');
630                                 margin = convertToPx(values[ 3 ] || values[ 1 ] || values [ 0 ]);
631 
632                                 // Figure out the indent unit by checking the first time of incrementation.
633                                 if (!listBaseIndent && previousListItemMargin !== null &&
634                                     margin > previousListItemMargin) {
635                                     listBaseIndent = margin - previousListItemMargin;
636                                 }
637 
638                                 previousListItemMargin = margin;
639                                 if (listBaseIndent) {
640                                     element.setAttribute('ke:indent', listBaseIndent &&
641                                         ( Math.ceil(margin / listBaseIndent) + 1 ) || 1);
642                                 }
643                             } ],
644                             // The best situation: "mso-list:l0 level1 lfo2" tells the belonged list root, list item indentation, etc.
645                             [ ( /^mso-list$/ ), null, function (val) {
646                                 val = val.split(' ');
647                                 var listId = Number(val[ 0 ].match(/\d+/)),
648                                     indent = Number(val[ 1 ].match(/\d+/));
649 
650                                 if (indent == 1) {
651                                     listId !== previousListId && ( element.setAttribute('ke:reset', 1) );
652 
653                                     previousListId = listId;
654                                 }
655                                 element.setAttribute('ke:indent', indent);
656                             } ]
657                         ])(element.getAttribute("style"), element);
658 
659                     setStyle(element, styleStr);
660                 }
661 
662                 // First level list item might be presented without a margin.
663                 // In case all above doesn't apply.
664                 if (!element.getAttribute("ke:indent")) {
665                     previousListItemMargin = 0;
666                     element.setAttribute('ke:indent', 1);
667                 }
668 
669                 S.each(listMarker.attributes, function (a) {
670                     element.setAttribute(a.name, a.value);
671                 });
672 
673                 return true;
674             }
675             // Current list disconnected.
676             else {
677                 previousListId = previousListItemMargin = listBaseIndent = null;
678             }
679             return false;
680         },
681 
682         // Providing a shorthand style then retrieve one or more style component values.
683         getStyleComponents:(function () {
684             var calculator = $(
685                 '<div style="position:absolute;left:-9999px;top:-9999px;"></div>').prependTo("body");
686 
687             return function (name, styleValue, fetchList) {
688                 calculator.css(name, styleValue);
689                 var styles = {},
690                     count = fetchList.length;
691                 for (var i = 0; i < count; i++)
692                     styles[ fetchList[ i ] ] = calculator.css(fetchList[ i ]);
693 
694                 return styles;
695             };
696         })(),
697 
698         listDtdParents:parentOf('ol')
699     };
700 
701     (function () {
702         var blockLike = S.merge(dtd.$block, dtd.$listItem, dtd.$tableContent),
703             falsyFilter = filters.falsyFilter,
704             stylesFilter = filters.stylesFilter,
705             createListBulletMarker = utils.createListBulletMarker,
706             flattenList = filters.flattenList,
707             assembleList = filters.assembleList,
708             isListBulletIndicator = utils.isListBulletIndicator,
709             containsNothingButSpaces = utils.isContainingOnlySpaces,
710             resolveListItem = utils.resolveList,
711             convertToPx = function (value) {
712                 value = convertToPx(value);
713                 return isNaN(value) ? value : value + 'px';
714             },
715             getStyleComponents = utils.getStyleComponents,
716             listDtdParents = utils.listDtdParents;
717 
718         wordFilter.addRules({
719 
720             tagNames:[
721                 // Remove script, meta and link elements.
722                 [ ( /meta|link|script/ ), '' ]
723             ],
724 
725             root:function (element) {
726                 element.filterChildren();
727                 assembleList(element);
728             },
729 
730             tags:{
731                 '^':function (element) {
732                     // Transform CSS style declaration to inline style.
733                     var applyStyleFilter;
734                     if (UA.gecko && ( applyStyleFilter = filters.applyStyleFilter ))
735                         applyStyleFilter(element);
736                 },
737 
738                 $:function (element) {
739                     var tagName = element.nodeName || ''
740 
741                     // Convert length unit of width/height on blocks to
742                     // a more editor-friendly way (px).
743                     if (tagName in blockLike && element.getAttribute("style")) {
744                         setStyle(element, stylesFilter(
745                             [
746                                 [ ( /^(:?width|height)$/ ), null, convertToPx ]
747                             ])(element.getAttribute("style")));
748                     }
749 
750                     // Processing headings.
751                     if (tagName.match(/h\d/)) {
752                         element.filterChildren();
753                         // Is the heading actually a list item?
754                         if (resolveListItem(element)) {
755                             return;
756                         }
757                     }
758                     // Remove inline elements which contain only empty spaces.
759                     else if (tagName in dtd.$inline) {
760                         element.filterChildren();
761                         if (containsNothingButSpaces(element)) {
762                             element.setTagName(null);
763                         }
764                     }
765                     // Remove element with ms-office namespace,
766                     // with it's content preserved, e.g. 'o:p'.
767                     else if (tagName.indexOf(':') != -1
768                         && tagName.indexOf('ke') == -1) {
769                         element.filterChildren();
770 
771                         // Restore image real link from vml.
772                         if (tagName == 'v:imagedata') {
773                             var href = element.getAttribute('o:href');
774                             if (href) {
775                                 element.setAttribute("src", href);
776                             }
777                             element.setTagName('img');
778                             return;
779                         }
780                         element.setTagName(null);
781                     }
782 
783                     // Assembling list items into a whole list.
784                     if (tagName in listDtdParents) {
785                         element.filterChildren();
786                         assembleList(element);
787                     }
788                 },
789 
790                 // We'll drop any style sheet, but Firefox conclude
791                 // certain styles in a single style element, which are
792                 // required to be changed into inline ones.
793                 'style':function (element) {
794                     if (UA.gecko) {
795                         // Grab only the style definition section.
796                         var styleDefSection = onlyChild(element).nodeValue
797                                 .match(/\/\* Style Definitions \*\/([\s\S]*?)\/\*/),
798                             styleDefText = styleDefSection && styleDefSection[ 1 ],
799                             rules = {}; // Storing the parsed result.
800 
801                         if (styleDefText) {
802                             styleDefText
803                                 // Remove line-breaks.
804                                 .replace(/[\n\r]/g, '')
805                                 // Extract selectors and style properties.
806                                 .replace(/(.+?)\{(.+?)\}/g,
807                                 function (rule, selectors, styleBlock) {
808                                     selectors = selectors.split(',');
809                                     var length = selectors.length;
810                                     for (var i = 0; i < length; i++) {
811                                         // Assume MS-Word mostly generate only simple
812                                         // selector( [Type selector][Class selector]).
813                                         S.trim(selectors[ i ])
814                                             .replace(/^(\w+)(\.[\w-]+)?$/g,
815                                             function (match, tagName, className) {
816                                                 tagName = tagName || '*';
817                                                 className = className.substring(1, className.length);
818 
819                                                 // Reject MS-Word Normal styles.
820                                                 if (className.match(/MsoNormal/))
821                                                     return;
822 
823                                                 if (!rules[ tagName ]) {
824                                                     rules[ tagName ] = {};
825                                                 }
826                                                 if (className) {
827                                                     rules[ tagName ][ className ] = styleBlock;
828                                                 } else {
829                                                     rules[ tagName ] = styleBlock;
830                                                 }
831                                             });
832                                     }
833                                 });
834 
835                             filters.applyStyleFilter = function (element) {
836                                 var name = rules[ '*' ] ? '*' : element.nodeName,
837                                     className = element.getAttribute('class'),
838                                     style;
839                                 if (name in rules) {
840                                     style = rules[ name ];
841                                     if (typeof style == 'object')
842                                         style = style[ className ];
843                                     // Maintain style rules priorities.
844                                     style && addStyle(element, style, true);
845                                 }
846                             };
847                         }
848                     }
849                     return false;
850                 },
851 
852                 'p':function (element) {
853                     // This's a fall-back approach to recognize list item in FF3.6,
854                     // as it's not perfect as not all list style (e.g. "heading list") is shipped
855                     // with this pattern. (#6662)
856                     if (/MsoListParagraph/.exec(element.getAttribute('class'))) {
857                         var bulletText = firstChild(element, function (node) {
858                             return node.nodeType == 3 && !containsNothingButSpaces(node.parentNode);
859                         });
860                         var bullet = bulletText && bulletText.parentNode;
861                         !bullet.getAttribute("style") && ( bullet.setAttribute("style", 'mso-list: Ignore;'));
862                     }
863 
864                     element.filterChildren();
865                     // Is the paragraph actually a list item?
866                     resolveListItem(element)
867                 },
868 
869                 'div':function (element) {
870                     // Aligned table with no text surrounded is represented by a wrapper div, from which
871                     // table cells inherit as text-align styles, which is wrong.
872                     // Instead we use a clear-float div after the table to properly achieve the same layout.
873                     var singleChild = onlyChild(element);
874                     if (singleChild && singleChild.nodeName == 'table') {
875                         var attrs = element.attributes;
876 
877                         S.each(attrs, function (attr) {
878                             singleChild.setAttribute(attr.name, attr.value);
879                         });
880 
881                         if (element.getAttribute("style")) {
882                             addStyle(singleChild, element.getAttribute("style"));
883                         }
884 
885                         var clearFloatDiv = new HtmlParser.Tag('div');
886                         addStyle(clearFloatDiv, 'clear', 'both');
887                         element.appendChild(clearFloatDiv);
888                         element.setTagName(null);
889                     }
890                 },
891 
892                 'td':function (element) {
893                     // 'td' in 'thead' is actually <th>.
894                     if (getAncestor(element, 'thead'))
895                         element.setTagName('th');
896                 },
897 
                // MS-Word sometimes presents a list as a mix of normal lists
                // and pseudo-lists; normalize the former into the pseudo form.
900                 'ol':flattenList,
901                 'ul':flattenList,
902                 'dl':flattenList,
903 
904                 'font':function (element) {
905                     // Drop the font tag if it comes from list bullet text.
906                     if (isListBulletIndicator(element.parentNode)) {
907                         element.setTagName(null);
908                         return;
909                     }
910 
911                     element.filterChildren();
912 
913                     var styleText = element.getAttribute("style"),
914                         parent = element.parentNode;
915 
916                     if ('font' == parent.name)     // Merge nested <font> tags.
917                     {
918                         S.each(element.attributes, function (attr) {
919                             parent.setAttribute(attr.name, attr.value);
920                         });
921                         styleText && addStyle(parent, styleText);
922                         element.setTagName(null);
923                     }
924                     // Convert the merged into a span with all attributes preserved.
925                     else {
926                         styleText = styleText || '';
927                         // IE's having those deprecated attributes, normalize them.
928                         if (element.getAttribute("color")) {
929                             element.getAttribute("color") != '#000000' && ( styleText += 'color:' + element.getAttribute("color") + ';' );
930                             element.removeAttribute("color");
931                         }
932                         if (element.getAttribute("face")) {
933                             styleText += 'font-family:' + element.getAttribute("face") + ';';
934                             element.removeAttribute("face");
935                         }
936                         var size = element.getAttribute("size");
937                         // TODO: Mapping size in ranges of xx-small,
938                         // x-small, small, medium, large, x-large, xx-large.
939                         if (size) {
940                             styleText += 'font-size:' +
941                                 (size > 3 ? 'large'
942                                     : ( size < 3 ? 'small' : 'medium' ) ) + ';';
943                             element.removeAttribute("size");
944                         }
945                         element.setTagName("span");
946                         addStyle(element, styleText);
947                     }
948                 },
949 
950                 'span':function (element) {
951                     // Remove the span if it comes from list bullet text.
952                     if (isListBulletIndicator(element.parentNode)) {
953                         return false;
954                     }
955                     element.filterChildren();
956                     if (containsNothingButSpaces(element)) {
957                         element.setTagName(null);
958                         return null;
959                     }
960 
961                     // List item bullet type is supposed to be indicated by
962                     // the text of a span with style 'mso-list : Ignore' or an image.
963                     if (isListBulletIndicator(element)) {
964                         var listSymbolNode = firstChild(element, function (node) {
965                             return node.nodeValue || node.nodeName == 'img';
966                         });
967 
968                         var listSymbol = listSymbolNode && ( listSymbolNode.nodeValue || 'l.' ),
969                             listType = listSymbol && listSymbol.match(/^(?:[(]?)([^\s]+?)([.)]?)$/);
970 
971                         if (listType) {
972                             var marker = createListBulletMarker(listType, listSymbol);
973                             // Some non-existed list items might be carried by an inconsequential list,
974                             // indicate by "mso-hide:all/display:none",
975                             // those are to be removed later, now mark it with "ke:ignored".
976                             var ancestor = getAncestor(element, 'span');
977                             if (ancestor && (/ mso-hide:\s*all|display:\s*none /).
978                                 test(ancestor.getAttribute("style"))) {
979                                 marker.setAttribute('ke:ignored', 1);
980                             }
981                             return marker;
982                         }
983                     }
984 
985                     // Update the src attribute of image element with href.
986                     var styleText = element.getAttribute("style");
987 
988                     // Assume MS-Word mostly carry font related styles on <span>,
989                     // adapting them to editor's convention.
990                     if (styleText) {
991 
992                         setStyle(element, stylesFilter(
993                             [
994                                 // Drop 'inline-height' style which make lines overlapping.
995                                 [ /^line-height$/ ],
996                                 [  /^font-family$/  ] ,
997                                 [  /^font-size$/  ] ,
998                                 [  /^color$/  ] ,
999                                 [  /^background-color$/  ]
1000                             ]
1001                         )(styleText, element));
1002                     }
1003                 },
1004                 // Editor doesn't support anchor with content currently (#3582),
1005                 // drop such anchors with content preserved.
1006                 'a':function (element) {
1007                     var href;
1008                     if (!(href = element.getAttribute("href")) && element.getAttribute("name")) {
1009                         element.setTagName(null);
1010                     } else if (UA.webkit && href && href.match(/file:\/\/\/[\S]+#/i)) {
1011                         element.setAttribute("href", href.replace(/file:\/\/\/[^#]+/i, ''));
1012                     }
1013                 },
1014                 'ke:listbullet':function (element) {
1015                     if (getAncestor(element, /h\d/)) {
1016                         element.setTagName(null);
1017                     }
1018                 }
1019             },
1020 
1021             attributeNames:[
1022                 // Remove onmouseover and onmouseout events (from MS Word comments effect)
1023                 [ ( /^onmouse(:?out|over)/ ), '' ],
1024                 // Onload on image element.
1025                 [ ( /^onload$/ ), '' ],
1026                 // Remove office and vml attribute from elements.
1027                 [ ( /(?:v|o):\w+/ ), '' ],
1028                 // Remove lang/language attributes.
1029                 [ ( /^lang/ ), '' ]
1030             ],
1031 
1032             attributes:{
1033                 'style':stylesFilter(
1034                     // Provide a white-list of styles that we preserve, those should
1035                     // be the ones that could later be altered with editor tools.
1036                     [
1037                         // Leave list-style-type
1038                         [ ( /^list-style-type$/ ) ],
1039 
1040                         // Preserve margin-left/right which used as default indent style in the editor.
1041                         [ ( /^margin$|^margin-(?!bottom|top)/ ), null, function (value, element, name) {
1042                             if (element.nodeName in { p:1, div:1 }) {
1043                                 var indentStyleName = 'margin-left';
1044 
1045                                 // Extract component value from 'margin' shorthand.
1046                                 if (name == 'margin') {
1047                                     value = getStyleComponents(name, value,
1048                                         [ indentStyleName ])[ indentStyleName ];
1049                                 } else if (name != indentStyleName) {
1050                                     return null;
1051                                 }
1052 
1053                                 if (value && !emptyMarginRegex.test(value)) {
1054                                     return [ indentStyleName, value ];
1055                                 }
1056                             }
1057 
1058                             return null;
1059                         } ],
1060 
1061                         // Preserve clear float style.
1062                         [ ( /^clear$/ ) ],
1063 
1064                         [ ( /^border.*|margin.*|vertical-align|float$/ ), null,
1065                             function (value, element) {
1066                                 if (element.nodeName == 'img')
1067                                     return value;
1068                             } ],
1069 
1070                         [ (/^width|height$/ ), null,
1071                             function (value, element) {
1072                                 if (element.nodeName in { table:1, td:1, th:1, img:1 })
1073                                     return value;
1074                             } ]
1075                     ], 1),
1076 
1077                 // Prefer width styles over 'width' attributes.
1078                 'width':function (value, element) {
1079                     if (element.nodeName in dtd.$tableContent)
1080                         return false;
1081                 },
1082                 // Prefer border styles over table 'border' attributes.
1083                 'border':function (value, element) {
1084                     if (element.nodeName in dtd.$tableContent)
1085                         return false;
1086                 },
1087 
1088                 // Only Firefox carry style sheet from MS-Word, which
1089                 // will be applied by us manually. For other browsers
1090                 // the css className is useless.
1091                 'class':falsyFilter,
1092 
1093                 // MS-Word always generate 'background-color' along with 'bgcolor',
1094                 // simply drop the deprecated attributes.
1095                 'bgcolor':falsyFilter,
1096 
1097                 // Deprecate 'valign' attribute in favor of 'vertical-align'.
1098                 'valign':function (value, element) {
1099                     addStyle(element, 'vertical-align', value);
1100                     return false;
1101                 }
1102             },
1103 
1104 
1105             // Fore none-IE, some useful data might be buried under these IE-conditional
1106             // comments where RegExp were the right approach to dig them out where usual approach
1107             // is transform it into a fake element node which hold the desired data.
1108             comment:UA.ie ?
1109                 function (value, node) {
1110                     var imageInfo = value.match(/<img.*?>/),
1111                         listInfo = value.match(/^\[if !supportLists\]([\s\S]*?)\[endif\]$/);
1112 
1113                     // Seek for list bullet indicator.
1114                     if (listInfo) {
1115                         // Bullet symbol could be either text or an image.
1116                         var listSymbol = listInfo[ 1 ] || ( imageInfo && 'l.' ),
1117                             listType = listSymbol && listSymbol.match(/>(?:[(]?)([^\s]+?)([.)]?)</);
1118                         return createListBulletMarker(listType, listSymbol);
1119                     }
1120 
1121                     // Reveal the <img> element in conditional comments for Firefox.
1122                     if (UA.gecko && imageInfo) {
1123                         var img = new HtmlParser.Parser(imageInfo[0]).parse().childNodes[ 0 ],
1124                             previousComment = node.previousSibling,
1125                         // Try to dig the real image link from vml markup from previous comment text.
1126                             imgSrcInfo = previousComment && previousComment.toHtml().match(/<v:imagedata[^>]*o:href=['"](.*?)['"]/),
1127                             imgSrc = imgSrcInfo && imgSrcInfo[ 1 ];
1128 
1129                         // Is there a real 'src' url to be used?
1130                         imgSrc && ( img.setAttribute("src", imgSrc) );
1131                         return img;
1132                     }
1133 
1134                     return false;
1135                 }
1136                 : falsyFilter
1137         });
1138     })();
1139 
1140     return {
1141 
1142         toDataFormat:function (html, editor) {
1143             // Firefox will be confused by those downlevel-revealed IE conditional
1144             // comments, fixing them first( convert it to upperlevel-revealed one ).
1145             // e.g. <![if !vml]>...<![endif]>
1146             //<!--[if !supportLists]-->
1147             // <span style=\"font-family: Wingdings;\" lang=\"EN-US\">
1148             // <span style=\"\">l<span style=\"font: 7pt "Times New Roman";\"> 
1149             // </span></span></span>
1150             // <!--[endif]-->
1151 
1152             //变成:
1153 
1154             //<!--[if !supportLists]
1155             // <span style=\"font-family: Wingdings;\" lang=\"EN-US\">
1156             // <span style=\"\">l<span style=\"font: 7pt "Times New Roman";\"> 
1157             // </span></span></span>
1158             // [endif]-->
1159             if (UA.gecko) {
1160                 html = html.replace(/(<!--\[if[^<]*?\])-->([\S\s]*?)<!--(\[endif\]-->)/gi,
1161                     '$1$2$3');
1162             }
1163 
1164             // 针对 word 一次
1165             html = editor.htmlDataProcessor.toDataFormat(html, wordFilter);
1166 
1167             // 普通的一次
1168             html = editor.htmlDataProcessor.toDataFormat(html);
1169 
1170             return html;
1171         }
1172 
1173     };
1174 
1175 
1176 }, {
1177     requires:['../styles', 'htmlparser']
1178 });