1 /**
  2  * Modified from ckeditor. Process malformed html for kissy editor.
  3  * @author yiminghe@gmail.com
  4  */
  5 /*
  6  Copyright (c) 2003-2010, CKSource - Frederico Knabben. All rights reserved.
  7  For licensing, see LICENSE.html or http://ckeditor.com/license
  8  */
  9 KISSY.add("editor/core/htmlDataProcessor", function (S, Editor) {
 10 
 11     return {
 12         init:function (editor) {
 13             var Node = S.Node,
 14                 UA = S.UA,
 15                 HtmlParser = S.require("htmlparser"),
 16                 htmlFilter = new HtmlParser.Filter(),
 17                 dataFilter = new HtmlParser.Filter();
 18 
 19             function filterSpan(element) {
 20                 if (element.getAttribute('class') == 'Apple-style-span'
 21                     || !(element.attributes.length)) {
 22                     element.setTagName(null);
 23                     return undefined;
 24                 }
 25                 if (!(element.childNodes.length) && !(element.attributes.length)) {
 26                     return false;
 27                 }
 28                 return undefined;
 29             }
 30 
 31             (function () {
 32 
 33                 function wrapAsComment(element) {
 34                     var html = HtmlParser.serialize(element);
 35                     return new HtmlParser.Comment(protectedSourceMarker +
 36                         encodeURIComponent(html).replace(/--/g,
 37                             "%2D%2D"));
 38                 }
 39 
 40                 // 过滤外边来的 html
 41                 var defaultDataFilterRules = {
 42                     tagNames:[
 43                         [/^\?xml.*$/i, ''],
 44                         [/^.*namespace.*$/i, '']
 45                     ],
 46                     attributeNames:[
 47                         // Event attributes (onXYZ) must not be directly set. They can become
 48                         // active in the editing area (IE|WebKit).
 49                         [/^on/, 'ke_on'],
 50                         [/^lang$/, '']
 51                     ],
 52                     tags:{
 53                         script:wrapAsComment,
 54                         noscript:wrapAsComment,
 55                         span:filterSpan
 56                     }
 57                 };
 58 
 59                 // 将编辑区生成 html 最终化
 60                 var defaultHtmlFilterRules = {
 61                     tagNames:[
 62                         // Remove the "ke:" namespace prefix.
 63                         [ ( /^ke:/ ), '' ],
 64                         // Ignore <?xml:namespace> tags.
 65                         [ ( /^\?xml:namespace$/ ), '' ]
 66                     ],
 67                     tags:{
 68                         $:function (element) {
 69                             var attributes = element.attributes;
 70 
 71                             if (attributes.length) {
 72                                 // 先把真正属性去掉,后面会把 _ke_saved 后缀去掉的!
 73                                 // Remove duplicated attributes - #3789.
 74                                 var attributeNames = [ 'name', 'href', 'src' ],
 75                                     savedAttributeName;
 76                                 for (var i = 0; i < attributeNames.length; i++) {
 77                                     savedAttributeName = '_ke_saved_' + attributeNames[ i ];
 78                                     if (element.getAttribute(savedAttributeName)) {
 79                                         element.removeAttribute(attributeNames[i]);
 80                                     }
 81                                 }
 82                             }
 83 
 84                             return element;
 85                         },
 86                         embed:function (element) {
 87                             var parent = element.parentNode;
 88                             // If the <embed> is child of a <object>, copy the width
 89                             // and height attributes from it.
 90                             if (parent && parent.nodeName == 'object') {
 91                                 var parentWidth = parent.getAttribute("width"),
 92                                     parentHeight = parent.getAttribute("height");
 93                                 if (parentWidth) {
 94                                     element.setAttribute("width", parentWidth);
 95                                 }
 96                                 if (parentHeight) {
 97                                     element.setAttribute("width", parentHeight);
 98                                 }
 99                             }
100                         },
101 
102                         // Remove empty link but not empty anchor.(#3829)
103                         a:function (element) {
104                             if (!(element.childNodes.length) && !(element.attributes.length)) {
105                                 return false;
106                             }
107                         },
108                         span:filterSpan
109                     },
110                     attributes:{
111                         // 清除空style
112                         style:function (v) {
113                             if (!S.trim(v)) {
114                                 return false;
115                             }
116                         }
117                     },
118                     attributeNames:[
119                         // 把保存的作为真正的属性,替换掉原来的
120                         // replace(/^_ke_saved_/,"")
121                         // _ke_saved_href -> href
122                         [ ( /^_ke_saved_/ ), '' ],
123                         [ ( /^ke_on/ ), 'on' ],
124                         [ ( /^_ke.*/ ), '' ],
125                         [ ( /^ke:.*$/ ), '' ],
126                         // kissy 相关
127                         [ ( /^_ks.*/ ), '' ]
128                     ],
129                     comment:function (contents) {
130                         // If this is a comment for protected source.
131                         if (contents.substr(0, protectedSourceMarker.length) == protectedSourceMarker) {
132                             contents = S.trim(decodeURIComponent(contents.substr(protectedSourceMarker.length)));
133                             return HtmlParser.parse(contents).childNodes[0];
134                         }
135                     }
136                 };
137                 if (UA['ie']) {
138                     // IE outputs style attribute in capital letters. We should convert
139                     // them back to lower case.
140                     // bug: style='background:url(www.G.cn)' =>  style='background:url(www.g.cn)'
141                     // 只对 propertyName 小写
142                     defaultHtmlFilterRules.attributes.style = function (value // , element
143                         ) {
144                         return value.replace(/(^|;)([^:]+)/g, function (match) {
145                             return match.toLowerCase();
146                         });
147                     };
148                 }
149 
150                 htmlFilter.addRules(defaultHtmlFilterRules);
151                 dataFilter.addRules(defaultDataFilterRules);
152             })();
153 
154 
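            /**
             * Remove the <br/> that Firefox automatically appends at the end of the content,
             * the &nbsp; that IE automatically appends,
             * and the placeholders that other browsers add at the end of paragraphs.
             */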
160             (function () {
161                 // Regex to scan for   at the end of blocks, which are actually placeholders.
162                 // Safari transforms the   to \xa0. (#4172)
163                 var tailNbspRegex = /^[\t\r\n ]*(?: |\xa0)$/;
164 
165                 // Return the last non-space child node of the block (#4344).
166                 function lastNoneSpaceChild(block) {
167                     var childNodes = block.childNodes,
168                         lastIndex = childNodes.length,
169                         last = childNodes[ lastIndex - 1 ];
170                     while (last && last.nodeType == 3 && !S.trim(last.nodeValue))
171                         last = childNodes[ --lastIndex ];
172                     return last;
173                 }
174 
175                 function trimFillers(block, fromSource) {
176                     // If the current node is a block, and if we're converting from source or
177                     // we're not in IE then search for and remove any tailing BR node.
178                     // Also, any   at the end of blocks are fillers, remove them as well.
179                     // (#2886)
180                     var lastChild = lastNoneSpaceChild(block);
181                     if (lastChild) {
182                         if (( fromSource || !UA['ie'] ) &&
183                             lastChild.nodeType == 1 &&
184                             lastChild.nodeName == 'br') {
185                             block.removeChild(lastChild);
186                         }
187                         else if (lastChild.nodeType == 3 &&
188                             tailNbspRegex.test(lastChild.nodeValue)) {
189                             block.removeChild(lastChild);
190                         }
191                     }
192                 }
193 
194                 function blockNeedsExtension(block) {
195                     var lastChild = lastNoneSpaceChild(block);
196 
197                     return !lastChild
198                         || lastChild.nodeType == 1 &&
199                         lastChild.nodeName == 'br'
200                         // Some of the controls in form needs extension too,
201                         // to move cursor at the end of the form. (#4791)
202                         || block.nodeName == 'form' &&
203                         lastChild.nodeName == 'input';
204                 }
205 
206                 function extendBlockForDisplay(block) {
207                     trimFillers(block, true);
208 
209                     if (blockNeedsExtension(block)) {
210                         // 任何浏览器都要加空格!否则空表格可能间隙太小,不能容下光标
211                         if (UA['ie']) {
212                             block.appendChild(new HtmlParser.Text('\xa0'));
213                         } else {
214                             //其他浏览器需要加空格??
215                             block.appendChild(new HtmlParser.Text(' '));
216                             block.appendChild(new HtmlParser.Tag('br'));
217                         }
218                     }
219                 }
220 
221                 function extendBlockForOutput(block) {
222                     trimFillers(block, false);
223                     if (blockNeedsExtension(block)) {
224                         block.appendChild(new HtmlParser.Text('\xa0'));
225                     }
226                 }
227 
228                 // Find out the list of block-like tags that can contain <br>.
229                 var dtd = Editor.XHTML_DTD;
230                 var blockLikeTags = S.merge(
231                     dtd.$block,
232                     dtd.$listItem,
233                     dtd.$tableContent), i;
234                 for (i in blockLikeTags) {
235                     if (blockLikeTags.hasOwnProperty(i)) {
236                         if (!( 'br' in dtd[i] )) {
237                             delete blockLikeTags[i];
238                         }
239                     }
240                 }
241 
242                 // table 布局需要,不要自动往 td 中加东西
243                 delete blockLikeTags.td;
244 
245                 // We just avoid filler in <pre> right now.
246                 // TODO: Support filler for <pre>, line break is also occupy line height.
247                 delete blockLikeTags.pre;
248                 var defaultDataBlockFilterRules = { tags:{} };
249                 var defaultHtmlBlockFilterRules = { tags:{} };
250 
251                 for (i in blockLikeTags) {
252                     if (blockLikeTags.hasOwnProperty(i)) {
253                         defaultDataBlockFilterRules.tags[ i ] = extendBlockForDisplay;
254                         defaultHtmlBlockFilterRules.tags[ i ] = extendBlockForOutput;
255                     }
256                 }
257                 dataFilter.addRules(defaultDataBlockFilterRules);
258                 htmlFilter.addRules(defaultHtmlBlockFilterRules);
259             })();
260 
261 
262             // htmlparser fragment 中的 entities 处理
263             // el.innerHTML=" "
264             // http://yiminghe.javaeye.com/blog/788929
265             htmlFilter.addRules({
266                 text:function (text) {
267                     return text
268                         //.replace(/ /g, "\xa0")
269                         .replace(/\xa0/g, " ");
270                 }
271             });
272 
273 
274             var protectElementRegex = /<(a|area|img|input)\b([^>]*)>/gi,
275                 protectAttributeRegex = /\b(href|src|name)\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|(?:[^ "'>]+))/gi;
276             // ie 6-7 会将 关于 url 的 content value 替换为 dom value
277             // #a -> http://xxx/#a
278             // ../x.html -> http://xx/x.html
279             function protectAttributes(html) {
280                 return html.replace(protectElementRegex, function (element, tag, attributes) {
281                     return '<' + tag + attributes.replace(protectAttributeRegex, function (fullAttr, attrName) {
282                         // We should not rewrite the existed protected attributes,
283                         // e.g. clipboard content from editor. (#5218)
284                         if (attributes.indexOf('_ke_saved_' + attrName) == -1) {
285                             return ' _ke_saved_' + fullAttr + ' ' + fullAttr;
286                         }
287                         return fullAttr;
288                     }) + '>';
289                 });
290             }
291 
292             var protectedSourceMarker = '{ke_protected}';
293 
294             var protectElementsRegex = /(?:<style[^>]*>[\s\S]*<\/style>)|(?:<(:?link|meta|base)[^>]*>)/gi,
295                 encodedElementsRegex = /<ke:encoded>([^<]*)<\/ke:encoded>/gi;
296 
297             var protectElementNamesRegex = /(<\/?)((?:object|embed|param|html|body|head|title|script|noscript)[^>]*>)/gi,
298                 unprotectElementNamesRegex = /(<\/?)ke:((?:object|embed|param|html|body|head|title|script|noscript)[^>]*>)/gi;
299 
300             var protectSelfClosingRegex = /<ke:(param|embed)([^>]*?)\/?>(?!\s*<\/ke:\1)/gi;
301 
302             function protectSelfClosingElements(html) {
303                 return html.replace(protectSelfClosingRegex, '<ke:$1$2></ke:$1>');
304             }
305 
306             function protectElements(html) {
307                 return html.replace(protectElementsRegex, function (match) {
308                     return '<ke:encoded>' + encodeURIComponent(match) + '</ke:encoded>';
309                 });
310             }
311 
312             function unprotectElements(html) {
313                 return html.replace(encodedElementsRegex, function (match, encoded) {
314                     return decodeURIComponent(encoded);
315                 });
316             }
317 
318             function protectElementsNames(html) {
319                 return html.replace(protectElementNamesRegex, '$1ke:$2');
320             }
321 
322             function unprotectElementNames(html) {
323                 return html.replace(unprotectElementNamesRegex, '$1$2');
324             }
325 
326             editor.htmlDataProcessor = {
327                 dataFilter:dataFilter,
328                 htmlFilter:htmlFilter,
329                 // 编辑器 html 到外部 html
330                 // fixForBody , <body>t</body> => <body><p>t</p></body>
331                 toHtml:function (html) {
332                     // fixForBody = fixForBody || "p";
333                     // Now use our parser to make further fixes to the structure, as
334                     // well as apply the filter.
335                     //使用 htmlWriter 界面美观,加入额外文字节点\n,\t空白等
336                     var writer = new HtmlParser.BeautifyWriter(),
337                         n = new HtmlParser.Parser(html).parse();
338                     n.writeHtml(writer, htmlFilter);
339                     html = writer.getHtml();
340                     return html;
341                 },
342                 // 外部html进入编辑器
343                 toDataFormat:function (html, _dataFilter) {
344                     //可以传 wordFilter 或 dataFilter
345                     _dataFilter = _dataFilter || dataFilter;
346 
347                     html = protectAttributes(html);
348 
349                     // Protect elements than can't be set inside a DIV. E.g. IE removes
350                     // style tags from innerHTML. (#3710)
351                     html = protectElements(html);
352 
353                     // Certain elements has problem to go through DOM operation, protect
354                     // them by prefixing 'ke' namespace. (#3591)
355                     html = protectElementsNames(html);
356 
357                     // All none-IE browsers ignore self-closed custom elements,
358                     // protecting them into open-close. (#3591)
359                     html = protectSelfClosingElements(html);
360 
361                     // 标签不合法可能 parser 出错,这里先用浏览器帮我们建立棵合法的 dom 树的 html
362                     // Call the browser to help us fixing a possibly invalid HTML
363                     // structure.
364                     var div = new Node("<div>");
365                     // Add fake character to workaround IE comments bug. (#3801)
366                     div.html('a' + html);
367                     html = div.html().substr(1);
368 
369                     // Unprotect "some" of the protected elements at this point.
370                     html = unprotectElementNames(html);
371 
372                     html = unprotectElements(html);
373 
374                     // fixForBody = fixForBody || "p";
375                     // bug:qc #3710:使用 basicWriter ,去除无用的文字节点,标签间连续\n空白等
376 
377                     var writer = new HtmlParser.BasicWriter(),
378                         n = new HtmlParser.Parser(html).parse();
379 
380                     n.writeHtml(writer, _dataFilter);
381 
382                     html = writer.getHtml();
383 
384                     return html;
385                 },
386                 /*
387                  最精简html传送到server
388                  */
389                 toServer:function (html) {
390                     var writer = new HtmlParser.MinifyWriter(),
391                         n = new HtmlParser.Parser(html).parse();
392                     n.writeHtml(writer, htmlFilter);
393                     return writer.getHtml();
394                 }
395             };
396         }
397     };
398 }, {
399     requires:['./base']
400 });
401