Gumbo
0.9.0
A C library for parsing HTML.
|
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
//
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
// kGumbo prefix).

/**
 * @file
 * Public interface of the Gumbo HTML parser: the value types used to describe
 * a parse tree (source positions, string pieces, vectors, tags, attributes,
 * nodes), the parser options, and the parse/destroy entry points.
 *
 * This header only declares the interface; every function and every
 * `extern const` object declared here is defined elsewhere.
 */

#ifndef GUMBO_GUMBO_H_
#define GUMBO_GUMBO_H_

#include <stdbool.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * A location in the source text, expressed as a line, a column and an offset.
 * NOTE(review): the numbering base of each field (0- or 1-based) and the unit
 * of `offset` are not stated in this header -- confirm against the tokenizer.
 */
typedef struct _GumboSourcePosition {
  unsigned int line;
  unsigned int column;
  unsigned int offset;
} GumboSourcePosition;

/** Shared constant meant to stand for "no position"; defined elsewhere. */
extern const GumboSourcePosition kGumboEmptySourcePosition;


/**
 * A string described by a pointer and an explicit length.
 * NOTE(review): because the length is carried separately, `data` is
 * presumably not required to be NUL-terminated -- confirm before passing it
 * to <string.h> functions.
 */
typedef struct _GumboStringPiece {
  /** Pointer to the first character of the piece. */
  const char* data;

  /** Length of the piece. */
  size_t length;
} GumboStringPiece;

/** Shared constant meant to stand for an empty string piece. */
extern const GumboStringPiece kGumboEmptyString;

/**
 * Compares two string pieces for equality.
 *
 * @param str1 First piece to compare.
 * @param str2 Second piece to compare.
 * @return Per the name, true when the two pieces are equal.
 */
bool gumbo_string_equals(
    const GumboStringPiece* str1, const GumboStringPiece* str2);

/**
 * Compares two string pieces for equality, ignoring case.
 * NOTE(review): which characters are case-folded (ASCII only vs. locale
 * dependent) is not visible here -- confirm in the implementation.
 *
 * @param str1 First piece to compare.
 * @param str2 Second piece to compare.
 * @return Per the name, true when the pieces are equal ignoring case.
 */
bool gumbo_string_equals_ignore_case(
    const GumboStringPiece* str1, const GumboStringPiece* str2);


/**
 * A sequence of untyped (`void*`) elements with a length and a capacity.
 * Fields of this type elsewhere in this header carry a comment naming the
 * element type they hold (for example GumboNode* or GumboAttribute*).
 */
typedef struct _GumboVector {
  /** Element storage; each slot is an untyped pointer. */
  void** data;

  /** Number of elements currently in use. */
  unsigned int length;

  /** Number of element slots available in `data`. */
  unsigned int capacity;
} GumboVector;

/** Shared constant meant to stand for an empty vector. */
extern const GumboVector kGumboEmptyVector;

/**
 * Looks up the position of `element` inside `vector`.
 * NOTE(review): the value returned when the element is absent is not visible
 * in this header; the signed return type suggests a negative sentinel --
 * confirm in the implementation.
 *
 * @param vector  Vector to search.
 * @param element Element pointer to look for.
 * @return Index of the element.
 */
int gumbo_vector_index_of(GumboVector* vector, void* element);


/**
 * Identifiers for the tags the parser knows about, grouped by the section of
 * the HTML specification that introduces them.  Enumerators are implicitly
 * numbered from 0 in declaration order, so GUMBO_TAG_LAST equals the number
 * of real enumerators and can be used as an iteration bound.
 */
typedef enum _GumboTag {
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
  GUMBO_TAG_HTML,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
  GUMBO_TAG_HEAD,
  GUMBO_TAG_TITLE,
  GUMBO_TAG_BASE,
  GUMBO_TAG_LINK,
  GUMBO_TAG_META,
  GUMBO_TAG_STYLE,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
  GUMBO_TAG_SCRIPT,
  GUMBO_TAG_NOSCRIPT,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
  GUMBO_TAG_BODY,
  GUMBO_TAG_SECTION,
  GUMBO_TAG_NAV,
  GUMBO_TAG_ARTICLE,
  GUMBO_TAG_ASIDE,
  GUMBO_TAG_H1,
  GUMBO_TAG_H2,
  GUMBO_TAG_H3,
  GUMBO_TAG_H4,
  GUMBO_TAG_H5,
  GUMBO_TAG_H6,
  GUMBO_TAG_HGROUP,
  GUMBO_TAG_HEADER,
  GUMBO_TAG_FOOTER,
  GUMBO_TAG_ADDRESS,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
  GUMBO_TAG_P,
  GUMBO_TAG_HR,
  GUMBO_TAG_PRE,
  GUMBO_TAG_BLOCKQUOTE,
  GUMBO_TAG_OL,
  GUMBO_TAG_UL,
  GUMBO_TAG_LI,
  GUMBO_TAG_DL,
  GUMBO_TAG_DT,
  GUMBO_TAG_DD,
  GUMBO_TAG_FIGURE,
  GUMBO_TAG_FIGCAPTION,
  GUMBO_TAG_DIV,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
  GUMBO_TAG_A,
  GUMBO_TAG_EM,
  GUMBO_TAG_STRONG,
  GUMBO_TAG_SMALL,
  GUMBO_TAG_S,
  GUMBO_TAG_CITE,
  GUMBO_TAG_Q,
  GUMBO_TAG_DFN,
  GUMBO_TAG_ABBR,
  GUMBO_TAG_TIME,
  GUMBO_TAG_CODE,
  GUMBO_TAG_VAR,
  GUMBO_TAG_SAMP,
  GUMBO_TAG_KBD,
  GUMBO_TAG_SUB,
  GUMBO_TAG_SUP,
  GUMBO_TAG_I,
  GUMBO_TAG_B,
  GUMBO_TAG_MARK,
  GUMBO_TAG_RUBY,
  GUMBO_TAG_RT,
  GUMBO_TAG_RP,
  GUMBO_TAG_BDI,
  GUMBO_TAG_BDO,
  GUMBO_TAG_SPAN,
  GUMBO_TAG_BR,
  GUMBO_TAG_WBR,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
  GUMBO_TAG_INS,
  GUMBO_TAG_DEL,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
  GUMBO_TAG_IMAGE,
  GUMBO_TAG_IMG,
  GUMBO_TAG_IFRAME,
  GUMBO_TAG_EMBED,
  GUMBO_TAG_OBJECT,
  GUMBO_TAG_PARAM,
  GUMBO_TAG_VIDEO,
  GUMBO_TAG_AUDIO,
  GUMBO_TAG_SOURCE,
  GUMBO_TAG_TRACK,
  GUMBO_TAG_CANVAS,
  GUMBO_TAG_MAP,
  GUMBO_TAG_AREA,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
  GUMBO_TAG_MATH,
  GUMBO_TAG_MI,
  GUMBO_TAG_MO,
  GUMBO_TAG_MN,
  GUMBO_TAG_MS,
  GUMBO_TAG_MTEXT,
  GUMBO_TAG_MGLYPH,
  GUMBO_TAG_MALIGNMARK,
  GUMBO_TAG_ANNOTATION_XML,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
  GUMBO_TAG_SVG,
  GUMBO_TAG_FOREIGNOBJECT,
  GUMBO_TAG_DESC,
  // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
  GUMBO_TAG_TABLE,
  GUMBO_TAG_CAPTION,
  GUMBO_TAG_COLGROUP,
  GUMBO_TAG_COL,
  GUMBO_TAG_TBODY,
  GUMBO_TAG_THEAD,
  GUMBO_TAG_TFOOT,
  GUMBO_TAG_TR,
  GUMBO_TAG_TD,
  GUMBO_TAG_TH,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
  GUMBO_TAG_FORM,
  GUMBO_TAG_FIELDSET,
  GUMBO_TAG_LEGEND,
  GUMBO_TAG_LABEL,
  GUMBO_TAG_INPUT,
  GUMBO_TAG_BUTTON,
  GUMBO_TAG_SELECT,
  GUMBO_TAG_DATALIST,
  GUMBO_TAG_OPTGROUP,
  GUMBO_TAG_OPTION,
  GUMBO_TAG_TEXTAREA,
  GUMBO_TAG_KEYGEN,
  GUMBO_TAG_OUTPUT,
  GUMBO_TAG_PROGRESS,
  GUMBO_TAG_METER,
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
  GUMBO_TAG_DETAILS,
  GUMBO_TAG_SUMMARY,
  GUMBO_TAG_COMMAND,
  GUMBO_TAG_MENU,
  // Non-conforming elements that nonetheless appear in the HTML5 spec.
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
  GUMBO_TAG_APPLET,
  GUMBO_TAG_ACRONYM,
  GUMBO_TAG_BGSOUND,
  GUMBO_TAG_DIR,
  GUMBO_TAG_FRAME,
  GUMBO_TAG_FRAMESET,
  GUMBO_TAG_NOFRAMES,
  GUMBO_TAG_ISINDEX,
  GUMBO_TAG_LISTING,
  GUMBO_TAG_XMP,
  GUMBO_TAG_NEXTID,
  GUMBO_TAG_NOEMBED,
  GUMBO_TAG_PLAINTEXT,
  GUMBO_TAG_RB,
  GUMBO_TAG_STRIKE,
  GUMBO_TAG_BASEFONT,
  GUMBO_TAG_BIG,
  GUMBO_TAG_BLINK,
  GUMBO_TAG_CENTER,
  GUMBO_TAG_FONT,
  GUMBO_TAG_MARQUEE,
  GUMBO_TAG_MULTICOL,
  GUMBO_TAG_NOBR,
  GUMBO_TAG_SPACER,
  GUMBO_TAG_TT,
  GUMBO_TAG_U,
  // Used for all tags that don't have special handling in HTML.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  // Also used as the terminator for varargs functions that take tags.
  // (No trailing comma after the last enumerator: this header is also
  // consumed from C++ via the extern "C" block, and pedantic C89/C++03
  // reject one.)
  GUMBO_TAG_LAST
} GumboTag;

/**
 * Maps a tag enumerator to its normalized tag name.
 * NOTE(review): ownership/lifetime of the returned string and the result for
 * GUMBO_TAG_UNKNOWN / GUMBO_TAG_LAST are not visible here -- confirm.
 *
 * @param tag Tag to look up.
 * @return The tag's name as a C string.
 */
const char* gumbo_normalized_tagname(GumboTag tag);

/**
 * Rewrites `text` in place (it is taken by non-const pointer) to describe a
 * tag name derived from the original source text of a tag.
 * NOTE(review): the exact trimming performed (angle brackets, slash,
 * attributes) is not visible in this header -- confirm in the implementation.
 *
 * @param text In/out string piece holding the original tag text.
 */
void gumbo_tag_from_original_text(GumboStringPiece* text);

/**
 * Maps an SVG tag name to its normalized spelling.
 * NOTE(review): the value returned for names that need no normalization
 * (possibly NULL) is not visible here -- confirm before dereferencing.
 *
 * @param tagname Tag name to normalize.
 * @return The normalized name as a C string.
 */
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);

/**
 * Maps a tag name to its GumboTag enumerator.
 * NOTE(review): the top-level `const` on the by-value return type has no
 * effect for callers; it is kept so this declaration stays textually in sync
 * with the out-of-view definition.
 *
 * @param tagname Tag name, as a C string.
 * @return The matching enumerator.
 */
const GumboTag gumbo_tag_enum(const char* tagname);

/**
 * Namespace of an attribute: none, or one of the xlink / xml / xmlns
 * namespaces.
 */
typedef enum _GumboAttributeNamespaceEnum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS
} GumboAttributeNamespaceEnum;

/**
 * One attribute of an element: its namespace, its name and value (each both
 * as a C string and as the original string piece), and the source positions
 * delimiting the name and the value.
 */
typedef struct _GumboAttribute {
  /** Namespace the attribute belongs to. */
  GumboAttributeNamespaceEnum attr_namespace;

  /** Attribute name as a C string. */
  const char* name;

  /** Attribute name as a piece of the original text. */
  GumboStringPiece original_name;

  /** Attribute value as a C string. */
  const char* value;

  /** Attribute value as a piece of the original text. */
  GumboStringPiece original_value;

  /** Where the attribute name starts. */
  GumboSourcePosition name_start;

  /** Where the attribute name ends. */
  GumboSourcePosition name_end;

  /** Where the attribute value starts. */
  GumboSourcePosition value_start;

  /** Where the attribute value ends. */
  GumboSourcePosition value_end;
} GumboAttribute;

/**
 * Finds an attribute by name in a vector of attributes.
 * NOTE(review): case sensitivity of the match and the result when no
 * attribute matches (presumably NULL) are not visible here -- confirm.
 *
 * @param attrs Vector holding the attributes to search.
 * @param name  Attribute name to look for.
 * @return Pointer to the matching attribute.
 */
GumboAttribute* gumbo_get_attribute(
    const struct _GumboVector* attrs, const char* name);

/**
 * Kind of a parse-tree node; selects which member of the node's `v` union is
 * meaningful (see struct _GumboNode).
 */
typedef enum _GumboNodeType {
  /** Document node; payload is `v.document`. */
  GUMBO_NODE_DOCUMENT,
  /** Element node; payload is `v.element`. */
  GUMBO_NODE_ELEMENT,
  /** Text node; payload is `v.text`. */
  GUMBO_NODE_TEXT,
  /** CDATA node; payload is `v.text`. */
  GUMBO_NODE_CDATA,
  /** Comment node; payload is `v.text`. */
  GUMBO_NODE_COMMENT,
  /** Whitespace node; payload is `v.text`. */
  GUMBO_NODE_WHITESPACE
} GumboNodeType;

/**
 * Forward declaration of the node type, needed because the document and
 * element structures hold vectors of GumboNode* before the node structure
 * itself is defined.
 */
typedef struct _GumboNode GumboNode;

/** Quirks mode of a document. */
typedef enum _GumboQuirksModeEnum {
  GUMBO_DOCTYPE_NO_QUIRKS,
  GUMBO_DOCTYPE_QUIRKS,
  GUMBO_DOCTYPE_LIMITED_QUIRKS
} GumboQuirksModeEnum;

/** Namespace of an element's tag: HTML, SVG or MathML. */
typedef enum _GumboNamespaceEnum {
  GUMBO_NAMESPACE_HTML,
  GUMBO_NAMESPACE_SVG,
  GUMBO_NAMESPACE_MATHML
} GumboNamespaceEnum;

/**
 * Bit flags recording how a node came to be inserted into the tree.  Each
 * non-zero enumerator is a distinct bit, so several can be OR-ed together in
 * a node's `parse_flags`.  The descriptions below are derived from the
 * enumerator names only.
 */
typedef enum _GumboParseFlags {
  /** No flag set. */
  GUMBO_INSERTION_NORMAL = 0,

  /** The node was inserted by the parser. */
  GUMBO_INSERTION_BY_PARSER = 1 << 0,

  /** The node's end tag was implicit. */
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,

  // Value 1 << 2 was for a flag that has since been removed.

  /** The node was implied. */
  GUMBO_INSERTION_IMPLIED = 1 << 3,

  /** The node was converted from an end tag. */
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,

  /** The node originates from an isindex tag. */
  GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,

  /** The node originates from an image tag. */
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,

  /** The node is a reconstructed formatting element. */
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,

  /** The node was cloned by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,

  /** The node was moved by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,

  /** The node was foster-parented. */
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10
} GumboParseFlags;


/** Payload of a GUMBO_NODE_DOCUMENT node. */
typedef struct _GumboDocument {
  /** Child nodes of the document. */
  GumboVector /* GumboNode* */ children;

  // True if there was an explicit doctype token as opposed to it being omitted.
  bool has_doctype;

  // Fields from the doctype token, copied verbatim.
  const char* name;
  const char* public_identifier;
  const char* system_identifier;

  /** Quirks mode of the document. */
  GumboQuirksModeEnum doc_type_quirks_mode;
} GumboDocument;

/**
 * Payload of every node that is neither a document nor an element (text,
 * CDATA, comment, whitespace).
 */
typedef struct _GumboText {
  /** The node's text as a C string. */
  const char* text;

  /** The node's text as a piece of the original source. */
  GumboStringPiece original_text;

  /** Where the text starts in the source. */
  GumboSourcePosition start_pos;
} GumboText;

/** Payload of a GUMBO_NODE_ELEMENT node. */
typedef struct _GumboElement {
  /** Child nodes of the element. */
  GumboVector /* GumboNode* */ children;

  /** Which tag this element is. */
  GumboTag tag;

  /** Namespace of the tag. */
  GumboNamespaceEnum tag_namespace;

  /** The tag as a piece of the original source. */
  GumboStringPiece original_tag;

  /** The end tag as a piece of the original source. */
  GumboStringPiece original_end_tag;

  /** Source position recorded as the element's start. */
  GumboSourcePosition start_pos;

  /** Source position recorded as the element's end. */
  GumboSourcePosition end_pos;

  /** Attributes of the element. */
  GumboVector /* GumboAttribute* */ attributes;
} GumboElement;

/**
 * A node of the parse tree: a type tag, a link to the parent, the node's
 * index within that parent, the flags describing how it was inserted, and a
 * type-dependent payload.
 */
struct _GumboNode {
  /** Kind of node; selects the valid member of `v`. */
  GumboNodeType type;

  /** Parent node. */
  GumboNode* parent;

  /** Position of this node among its parent's children. */
  size_t index_within_parent;

  /** How the node was inserted; see GumboParseFlags. */
  GumboParseFlags parse_flags;

  /** Payload; which member is meaningful depends on `type`. */
  union {
    GumboDocument document;   // For GUMBO_NODE_DOCUMENT.
    GumboElement element;     // For GUMBO_NODE_ELEMENT.
    GumboText text;           // For everything else.
  } v;
};

/**
 * Allocation callback: receives the caller-supplied `userdata` and the number
 * of bytes requested, and returns the allocated memory.
 */
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);

/**
 * Deallocation callback: receives the caller-supplied `userdata` and the
 * pointer to release.
 */
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);

/** Options controlling a parse; see kGumboDefaultOptions for the defaults. */
typedef struct _GumboOptions {
  /** Callback used to allocate memory. */
  GumboAllocatorFunction allocator;

  /** Callback used to release memory. */
  GumboDeallocatorFunction deallocator;

  /** Opaque pointer handed to both callbacks as their first argument. */
  void* userdata;

  /**
   * Tab stop setting.
   * NOTE(review): presumably used when computing column numbers -- confirm.
   */
  int tab_stop;

  /** Per the name, whether parsing stops at the first error. */
  bool stop_on_first_error;

  /**
   * Per the name, upper bound on the number of errors recorded.
   * NOTE(review): the meaning of negative values is not visible here.
   */
  int max_errors;
} GumboOptions;

/** Default options; defined elsewhere. */
extern const GumboOptions kGumboDefaultOptions;

/** Result of a parse. */
typedef struct _GumboOutput {
  /** The document node. */
  GumboNode* document;

  /** The root node. */
  GumboNode* root;

  /** Errors collected during the parse. */
  GumboVector /* GumboError */ errors;
} GumboOutput;

/**
 * Parses `buffer` and returns the resulting output.
 * NOTE(review): no length is passed, so `buffer` is presumably a
 * NUL-terminated string parsed with the default options -- confirm.  The
 * result is presumably to be released with gumbo_destroy_output().
 *
 * @param buffer Text to parse.
 * @return The parse output.
 */
struct _GumboOutput* gumbo_parse(const char* buffer);

/**
 * Parses `buffer_length` bytes of `buffer` using the given options.
 *
 * @param options       Options to parse with.
 * @param buffer        Text to parse.
 * @param buffer_length Length of `buffer`.
 * @return The parse output.
 */
struct _GumboOutput* gumbo_parse_with_options(
    const GumboOptions* options, const char* buffer, size_t buffer_length);

/**
 * Releases a parse output.
 * NOTE(review): `options` presumably has to be the same options the output
 * was produced with, so that the matching deallocator is used -- confirm.
 *
 * @param options Options supplying the deallocator.
 * @param output  Output to destroy.
 */
void gumbo_destroy_output(
    const struct _GumboOptions* options, GumboOutput* output);


#ifdef __cplusplus
}
#endif

#endif  // GUMBO_GUMBO_H_