/** * @ignore * parse html to a hierarchy dom tree * @author yiminghe@gmail.com */ KISSY.add("html-parser/parser", function (S, dtd, Tag, Fragment, Cursor, Lexer, Document, Scanner) { /** * Html Parse Class * @param html * @param opts * @class KISSY.HtmlParser.Parser */ function Parser(html, opts) { // fake root node html = S.trim(html); this.originalHTML = html; // only allow condition // 1. start with <!doctype // 2. start with <!html // 3. start with <!body // 4. not start with <head // 5. not start with <meta if (/^(<!doctype|<html|<body)/i.test(html)) { html = "<document>" + html + "</document>"; } else { html = "<body>" + html + "</body>"; } this.lexer = new Lexer(html); this.opts = opts || {}; } Parser.prototype = { constructor:Parser, elements:function () { var root , doc, lexer = this.lexer, opts = this.opts; doc = root = lexer.nextNode(); if (root.tagName != 'document') { doc = new Document(); doc.appendChild(root); } doc.nodeType = 9; Scanner.getScanner("div").scan(root, lexer, opts); var body = fixBody(doc); if (body && opts['autoParagraph']) { autoParagraph(body); } post_process(doc); var originalHTML = this.originalHTML, fragment = new Fragment(), cs; if (/^(<!doctype|<html|<body)/i.test(originalHTML)) { cs = doc.childNodes; } else { cs = body.childNodes; } S.each(cs, function (c) { fragment.appendChild(c); }); return fragment; }, parse:function () { return this.elements(); } }; function fixBody(doc) { // 3 limit depth var body = findTagWithName(doc, "body", 3); if (body) { /** * <body> <li>2</li> <span>1</span> <li>2</li> <span>3</span> <li>2</li> </body> */ var parent = body.parentNode, silbing = parent.childNodes, bodyIndex = S.indexOf(body, silbing); if (bodyIndex != silbing.length - 1) { var fixes = silbing.slice(bodyIndex + 1, silbing.length); for (var i = 0; i < fixes.length; i++) { parent.removeChild(fixes[i]); if (fixes[i].tagName == "body") { S.each(fixes[i].childNodes, function (c) { body.appendChild(c); }); } else { body.appendChild(fixes[i]); } } } } return body; } function autoParagraph(doc) { var childNodes = doc.childNodes, c, i, pDtd = dtd['p'], needFix = 0; for (i = 0; i < childNodes.length; i++) { c = childNodes[i]; if (c.nodeType == 3 || (c.nodeType == 1 && pDtd[c.nodeName])) { needFix = 1; break; } } if (needFix) { var newChildren = [], holder = new Tag(); holder.nodeName = holder.tagName = "p"; for (i = 0; i < childNodes.length; i++) { c = childNodes[i]; if (c.nodeType == 3 || (c.nodeType == 1 && pDtd[c.nodeName])) { holder.appendChild(c); } else { if (holder.childNodes.length) { newChildren.push(holder); holder = holder.clone(); } newChildren.push(c); } } if (holder.childNodes.length) { newChildren.push(holder); } doc.empty(); for (i = 0; i < newChildren.length; i++) { doc.appendChild(newChildren[i]); } } } function findTagWithName(root, tagName, level) { if (level === 0) return 0; if (typeof level==='number') { level--; } var r, childNodes = root.childNodes; if (childNodes) { for (var i = 0; i < childNodes.length; i++) { if (childNodes[i].tagName === tagName) { return childNodes[i]; } if (r = findTagWithName(childNodes[i], tagName, level)) { return r; } } } return 0; } function post_process(doc) { // Space characters before the root html element, // and space characters at the start of the html element and before the head element, // will be dropped when the document is parsed; var childNodes = [].concat(doc.childNodes); for (var i = 0; i < childNodes.length; i++) { if (childNodes[i].nodeName == "html") { var html = childNodes[i]; for (var j = 0; j < i; j++) { if (childNodes[j].nodeType == 3 && !S.trim(childNodes[j].toHtml())) { doc.removeChild(childNodes[j]); } } while (html.firstChild && html.firstChild.nodeType == 3 && !S.trim(html.firstChild.toHtml())) { html.removeChild(html.firstChild); } break; } } } return Parser; }, { requires:[ './dtd', './nodes/tag', './nodes/fragment', './lexer/cursor', './lexer/lexer', './nodes/document', './scanner' ] });