Gumbo  0.9.0
A C library for parsing HTML.
gumbo.h
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 //
00003 // Licensed under the Apache License, Version 2.0 (the "License");
00004 // you may not use this file except in compliance with the License.
00005 // You may obtain a copy of the License at
00006 //
00007 //     http://www.apache.org/licenses/LICENSE-2.0
00008 //
00009 // Unless required by applicable law or agreed to in writing, software
00010 // distributed under the License is distributed on an "AS IS" BASIS,
00011 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00012 // See the License for the specific language governing permissions and
00013 // limitations under the License.
00014 //
00015 // Author: jdtang@google.com (Jonathan Tang)
00016 //
00017 // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
00018 // GUMBO_ as a prefix for enum constants (static constants get the Google-style
00019 // kGumbo prefix).
00020 
00042 #ifndef GUMBO_GUMBO_H_
00043 #define GUMBO_GUMBO_H_
00044 
00045 #include <stdbool.h>
00046 #include <stddef.h>
00047 
00048 #ifdef __cplusplus
00049 extern "C" {
00050 #endif
00051 
// A location in the parsed input, expressed as three unsigned counters.
// NOTE(review): whether line/column are 0- or 1-based, and whether offset
// counts bytes or characters, is not visible in this header - confirm against
// the implementation before relying on it.
00062 typedef struct _GumboSourcePosition {
        // Line counter for this position.
00063   unsigned int line;
        // Column counter within that line.
00064   unsigned int column;
        // Offset counter; presumably measured from the start of the input buffer
        // - TODO confirm.
00065   unsigned int offset;
00066 } GumboSourcePosition;
00067 
// Shared read-only GumboSourcePosition; only declared here, the definition
// lives in another translation unit. Presumably the "no position" sentinel -
// its field values are not visible from this header.
00072 extern const GumboSourcePosition kGumboEmptySourcePosition;
00073 
00074 
// A (pointer, length) view of character data. The pointee is const, so the
// text cannot be modified through this struct.
// NOTE(review): presumably the data is NOT guaranteed to be NUL-terminated
// (hence the explicit length) and is not owned by the struct - confirm.
00084 typedef struct _GumboStringPiece {
        // First character of the viewed text.
00086   const char* data;
00087 
        // Number of chars in the view.
00089   size_t length;
00090 } GumboStringPiece;
00091 
// Shared read-only GumboStringPiece; only declared here, defined elsewhere.
// Presumably represents the empty string (length 0) - values not visible here.
00093 extern const GumboStringPiece kGumboEmptyString;
00094 
// Compares two string pieces and reports the result as a bool.
// Both pieces are taken by pointer-to-const, so neither is modified.
// Presumably an exact (case-sensitive) comparison, given the separate
// gumbo_string_equals_ignore_case declaration - confirm in the implementation.
// NOTE(review): NULL handling for str1/str2 is not visible here.
00099 bool gumbo_string_equals(
00100     const GumboStringPiece* str1, const GumboStringPiece* str2);
00101 
// Compares two string pieces, ignoring letter case, and reports the result
// as a bool. Both pieces are taken by pointer-to-const.
// NOTE(review): whether case folding is ASCII-only or locale-dependent is not
// visible in this header - confirm before using on non-ASCII text.
00106 bool gumbo_string_equals_ignore_case(
00107     const GumboStringPiece* str1, const GumboStringPiece* str2);
00108 
00109 
// An untyped array of void* elements. Because elements are void*, users of
// this type in this header annotate the real element type in a comment, e.g.
// `GumboVector /* GumboNode* */ children;`.
// NOTE(review): ownership of `data` and the growth policy are not visible here.
00119 typedef struct _GumboVector {
        // Array of element pointers.
00123   void** data;
00124 
        // Element count; presumably the number of slots in use - TODO confirm.
00126   unsigned int length;
00127 
        // Presumably the number of slots allocated in `data` - TODO confirm.
00129   unsigned int capacity;
00130 } GumboVector;
00131 
// Shared read-only GumboVector; only declared here, defined elsewhere.
// Presumably an empty vector (no elements) - values not visible here.
00133 extern const GumboVector kGumboEmptyVector;
00134 
// Looks up `element` in `vector` and returns its index as an int.
// NOTE(review): presumably compares by pointer identity and returns a negative
// value (likely -1) when the element is absent - confirm in the implementation.
// The result is a signed int although GumboVector.length is unsigned int.
00139 int gumbo_vector_index_of(GumboVector* vector, void* element);
00140 
00141 
// Identifies an element's tag. Enumerators are grouped by the section of the
// HTML specification linked in the comment above each group.
//
// No enumerator has an explicit value, so values are assigned sequentially
// from 0 in declaration order: inserting or reordering entries changes the
// numeric value of every later tag. GUMBO_TAG_LAST must stay last because it
// is the end marker used for iteration and as a varargs terminator (see its
// comment below).
00154 typedef enum _GumboTag {
00155   // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
00156   GUMBO_TAG_HTML,
00157   // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
00158   GUMBO_TAG_HEAD,
00159   GUMBO_TAG_TITLE,
00160   GUMBO_TAG_BASE,
00161   GUMBO_TAG_LINK,
00162   GUMBO_TAG_META,
00163   GUMBO_TAG_STYLE,
00164   // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
00165   GUMBO_TAG_SCRIPT,
00166   GUMBO_TAG_NOSCRIPT,
00167   // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
00168   GUMBO_TAG_BODY,
00169   GUMBO_TAG_SECTION,
00170   GUMBO_TAG_NAV,
00171   GUMBO_TAG_ARTICLE,
00172   GUMBO_TAG_ASIDE,
00173   GUMBO_TAG_H1,
00174   GUMBO_TAG_H2,
00175   GUMBO_TAG_H3,
00176   GUMBO_TAG_H4,
00177   GUMBO_TAG_H5,
00178   GUMBO_TAG_H6,
00179   GUMBO_TAG_HGROUP,
00180   GUMBO_TAG_HEADER,
00181   GUMBO_TAG_FOOTER,
00182   GUMBO_TAG_ADDRESS,
00183   // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
00184   GUMBO_TAG_P,
00185   GUMBO_TAG_HR,
00186   GUMBO_TAG_PRE,
00187   GUMBO_TAG_BLOCKQUOTE,
00188   GUMBO_TAG_OL,
00189   GUMBO_TAG_UL,
00190   GUMBO_TAG_LI,
00191   GUMBO_TAG_DL,
00192   GUMBO_TAG_DT,
00193   GUMBO_TAG_DD,
00194   GUMBO_TAG_FIGURE,
00195   GUMBO_TAG_FIGCAPTION,
00196   GUMBO_TAG_DIV,
00197   // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
00198   GUMBO_TAG_A,
00199   GUMBO_TAG_EM,
00200   GUMBO_TAG_STRONG,
00201   GUMBO_TAG_SMALL,
00202   GUMBO_TAG_S,
00203   GUMBO_TAG_CITE,
00204   GUMBO_TAG_Q,
00205   GUMBO_TAG_DFN,
00206   GUMBO_TAG_ABBR,
00207   GUMBO_TAG_TIME,
00208   GUMBO_TAG_CODE,
00209   GUMBO_TAG_VAR,
00210   GUMBO_TAG_SAMP,
00211   GUMBO_TAG_KBD,
00212   GUMBO_TAG_SUB,
00213   GUMBO_TAG_SUP,
00214   GUMBO_TAG_I,
00215   GUMBO_TAG_B,
00216   GUMBO_TAG_MARK,
00217   GUMBO_TAG_RUBY,
00218   GUMBO_TAG_RT,
00219   GUMBO_TAG_RP,
00220   GUMBO_TAG_BDI,
00221   GUMBO_TAG_BDO,
00222   GUMBO_TAG_SPAN,
00223   GUMBO_TAG_BR,
00224   GUMBO_TAG_WBR,
00225   // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
00226   GUMBO_TAG_INS,
00227   GUMBO_TAG_DEL,
00228   // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
00229   GUMBO_TAG_IMAGE,
00230   GUMBO_TAG_IMG,
00231   GUMBO_TAG_IFRAME,
00232   GUMBO_TAG_EMBED,
00233   GUMBO_TAG_OBJECT,
00234   GUMBO_TAG_PARAM,
00235   GUMBO_TAG_VIDEO,
00236   GUMBO_TAG_AUDIO,
00237   GUMBO_TAG_SOURCE,
00238   GUMBO_TAG_TRACK,
00239   GUMBO_TAG_CANVAS,
00240   GUMBO_TAG_MAP,
00241   GUMBO_TAG_AREA,
00242   // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
00243   GUMBO_TAG_MATH,
00244   GUMBO_TAG_MI,
00245   GUMBO_TAG_MO,
00246   GUMBO_TAG_MN,
00247   GUMBO_TAG_MS,
00248   GUMBO_TAG_MTEXT,
00249   GUMBO_TAG_MGLYPH,
00250   GUMBO_TAG_MALIGNMARK,
00251   GUMBO_TAG_ANNOTATION_XML,
00252   // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
00253   GUMBO_TAG_SVG,
00254   GUMBO_TAG_FOREIGNOBJECT,
00255   GUMBO_TAG_DESC,
00256   // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
00257   // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
00258   GUMBO_TAG_TABLE,
00259   GUMBO_TAG_CAPTION,
00260   GUMBO_TAG_COLGROUP,
00261   GUMBO_TAG_COL,
00262   GUMBO_TAG_TBODY,
00263   GUMBO_TAG_THEAD,
00264   GUMBO_TAG_TFOOT,
00265   GUMBO_TAG_TR,
00266   GUMBO_TAG_TD,
00267   GUMBO_TAG_TH,
00268   // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
00269   GUMBO_TAG_FORM,
00270   GUMBO_TAG_FIELDSET,
00271   GUMBO_TAG_LEGEND,
00272   GUMBO_TAG_LABEL,
00273   GUMBO_TAG_INPUT,
00274   GUMBO_TAG_BUTTON,
00275   GUMBO_TAG_SELECT,
00276   GUMBO_TAG_DATALIST,
00277   GUMBO_TAG_OPTGROUP,
00278   GUMBO_TAG_OPTION,
00279   GUMBO_TAG_TEXTAREA,
00280   GUMBO_TAG_KEYGEN,
00281   GUMBO_TAG_OUTPUT,
00282   GUMBO_TAG_PROGRESS,
00283   GUMBO_TAG_METER,
00284   // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
00285   GUMBO_TAG_DETAILS,
00286   GUMBO_TAG_SUMMARY,
00287   GUMBO_TAG_COMMAND,
00288   GUMBO_TAG_MENU,
00289   // Non-conforming elements that nonetheless appear in the HTML5 spec.
00290   // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
00291   GUMBO_TAG_APPLET,
00292   GUMBO_TAG_ACRONYM,
00293   GUMBO_TAG_BGSOUND,
00294   GUMBO_TAG_DIR,
00295   GUMBO_TAG_FRAME,
00296   GUMBO_TAG_FRAMESET,
00297   GUMBO_TAG_NOFRAMES,
00298   GUMBO_TAG_ISINDEX,
00299   GUMBO_TAG_LISTING,
00300   GUMBO_TAG_XMP,
00301   GUMBO_TAG_NEXTID,
00302   GUMBO_TAG_NOEMBED,
00303   GUMBO_TAG_PLAINTEXT,
00304   GUMBO_TAG_RB,
00305   GUMBO_TAG_STRIKE,
00306   GUMBO_TAG_BASEFONT,
00307   GUMBO_TAG_BIG,
00308   GUMBO_TAG_BLINK,
00309   GUMBO_TAG_CENTER,
00310   GUMBO_TAG_FONT,
00311   GUMBO_TAG_MARQUEE,
00312   GUMBO_TAG_MULTICOL,
00313   GUMBO_TAG_NOBR,
00314   GUMBO_TAG_SPACER,
00315   GUMBO_TAG_TT,
00316   GUMBO_TAG_U,
00317   // Used for all tags that don't have special handling in HTML.
00318   GUMBO_TAG_UNKNOWN,
00319   // A marker value to indicate the end of the enum, for iterating over it.
00320   // Also used as the terminator for varargs functions that take tags.
00321   GUMBO_TAG_LAST,
00322 } GumboTag;
00323 
// Maps a GumboTag value to a tag-name string. The result points to const
// char, so callers must not write through it.
// NOTE(review): the result for GUMBO_TAG_UNKNOWN / GUMBO_TAG_LAST and the
// lifetime of the returned string are not visible here - confirm.
00329 const char* gumbo_normalized_tagname(GumboTag tag);
00330 
// Operates on a string piece holding original tag text. The function returns
// void and takes a non-const pointer, so any result is evidently delivered by
// modifying *text in place.
// NOTE(review): presumably narrows the piece from the full tag markup to just
// the tag name - confirm the exact transformation in the implementation.
00341 void gumbo_tag_from_original_text(GumboStringPiece* text);
00342 
// Maps an SVG tag name, given as a string piece, to a normalized name string.
// The input is not modified (pointer-to-const).
// NOTE(review): presumably returns NULL when the name has no SVG-specific
// normalized form - confirm before dereferencing the result.
00355 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
00356 
// Maps a tag-name string to its GumboTag value.
// NOTE(review): presumably expects a NUL-terminated name (no length is
// passed) and yields GUMBO_TAG_UNKNOWN for unrecognized names - confirm.
// NOTE(review): the `const` on the by-value return type has no effect for
// callers and triggers -Wignored-qualifiers on GCC/Clang. If it is removed,
// the matching definition must be changed in the same commit so declaration
// and definition keep identical return types.
00361 const GumboTag gumbo_tag_enum(const char* tagname);
00362 
// Namespace of an attribute (stored in GumboAttribute.attr_namespace).
// Values are implicit, starting at 0 for GUMBO_ATTR_NAMESPACE_NONE.
// NOTE(review): presumably NONE is the case for ordinary HTML attributes and
// the others apply to prefixed attributes in foreign content - confirm.
00368 typedef enum _GumboAttributeNamespaceEnum {
00369   GUMBO_ATTR_NAMESPACE_NONE,
00370   GUMBO_ATTR_NAMESPACE_XLINK,
00371   GUMBO_ATTR_NAMESPACE_XML,
00372   GUMBO_ATTR_NAMESPACE_XMLNS,
00373 } GumboAttributeNamespaceEnum;
00374 
// One attribute of an element: its namespace, its name and value in two
// representations each (a const char* string and a GumboStringPiece named
// "original_*"), and four source positions delimiting name and value.
// NOTE(review): presumably the const char* fields are processed,
// NUL-terminated copies while the original_* pieces reference the raw input
// text - confirm.
00380 typedef struct _GumboAttribute {
        // Namespace this attribute belongs to.
00387   GumboAttributeNamespaceEnum attr_namespace;
00388 
        // Attribute name as a C string.
00393   const char* name;
00394 
        // Attribute name as a (pointer, length) piece.
00399   GumboStringPiece original_name;
00400 
        // Attribute value as a C string.
00407   const char* value;
00408 
        // Attribute value as a (pointer, length) piece.
00417   GumboStringPiece original_value;
00418 
        // Position where the name starts.
00420   GumboSourcePosition name_start;
00421 
        // Position where the name ends; whether inclusive or one-past-the-end is
        // not visible here - TODO confirm.
00427   GumboSourcePosition name_end;
00428 
        // Position where the value starts.
00430   GumboSourcePosition value_start;
00431 
        // Position where the value ends (same inclusive/exclusive caveat).
00433   GumboSourcePosition value_end;
00434 } GumboAttribute;
00435 
// Looks up an attribute by name in a vector of attributes and returns a
// pointer to it. `struct _GumboVector` is the same type as GumboVector; the
// vector itself is not modified (pointer-to-const).
// NOTE(review): presumably `attrs` holds GumboAttribute* elements (as
// GumboElement.attributes is annotated), the match is case-insensitive, and
// NULL is returned when no attribute matches - confirm each point.
00441 GumboAttribute* gumbo_get_attribute(
00442     const struct _GumboVector* attrs, const char* name);
00443 
// Discriminator for GumboNode: stored in GumboNode.type and selects which
// member of the GumboNode.v union is meaningful (document, element, or text
// for every other type, per the comments on that union).
// Values are implicit, starting at 0 for GUMBO_NODE_DOCUMENT.
00448 typedef enum _GumboNodeType {
00450   GUMBO_NODE_DOCUMENT,
00452   GUMBO_NODE_ELEMENT,
00454   GUMBO_NODE_TEXT,
00456   GUMBO_NODE_CDATA,
00458   GUMBO_NODE_COMMENT,
00460   GUMBO_NODE_WHITESPACE
00461 } GumboNodeType;
00462 
// Forward declaration: struct _GumboNode is defined further down, after the
// GumboDocument/GumboElement/GumboText payload types it embeds. Declaring the
// typedef early lets earlier declarations refer to GumboNode by pointer.
00467 typedef struct _GumboNode GumboNode;
00468 
// Quirks-mode classification of a document (stored in
// GumboDocument.doc_type_quirks_mode). Values are implicit, starting at 0
// for GUMBO_DOCTYPE_NO_QUIRKS.
00470 typedef enum _GumboQuirksModeEnum {
00471   GUMBO_DOCTYPE_NO_QUIRKS,
00472   GUMBO_DOCTYPE_QUIRKS,
00473   GUMBO_DOCTYPE_LIMITED_QUIRKS
00474 } GumboQuirksModeEnum;
00475 
// Namespace of an element's tag (stored in GumboElement.tag_namespace).
// Values are implicit, starting at 0 for GUMBO_NAMESPACE_HTML.
00483 typedef enum _GumboNamespaceEnum {
00484   GUMBO_NAMESPACE_HTML,
00485   GUMBO_NAMESPACE_SVG,
00486   GUMBO_NAMESPACE_MATHML
00487 } GumboNamespaceEnum;
00488 
// Flags recorded on each node (GumboNode.parse_flags) describing how the node
// came to be inserted. GUMBO_INSERTION_NORMAL is 0 (no bits set); every other
// enumerator is a distinct single bit, and bit 2 is intentionally unused (see
// the comment below).
// NOTE(review): because the values are single bits they are presumably OR-ed
// together, so a stored value need not equal any one enumerator - test with
// `&` rather than `==`. Confirm against the parser.
00497 typedef enum _GumboParseFlags {
00502   GUMBO_INSERTION_NORMAL = 0,
00503 
00510   GUMBO_INSERTION_BY_PARSER = 1 << 0,
00511 
00523   GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
00524 
00525   // Value 1 << 2 was for a flag that has since been removed.
00526 
00531   GUMBO_INSERTION_IMPLIED = 1 << 3,
00532 
00539   GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
00540 
00542   GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
00543 
00545   GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
00546 
00552   GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
00553 
00555   GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
00556 
00558   GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
00559 
00564   GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
00565 } GumboParseFlags;
00566 
00567 
// Payload of a GUMBO_NODE_DOCUMENT node (the `document` member of the
// GumboNode.v union): the document's children, the doctype information, and
// the quirks mode.
00571 typedef struct _GumboDocument {
        // Child nodes; elements are GumboNode* (see the type annotation comment).
00577   GumboVector /* GumboNode* */ children;
00578 
00579   // True if there was an explicit doctype token as opposed to it being omitted.
00580   bool has_doctype;
00581 
00582   // Fields from the doctype token, copied verbatim.
00583   const char* name;
00584   const char* public_identifier;
00585   const char* system_identifier;
00586 
        // Quirks-mode classification for this document.
00591   GumboQuirksModeEnum doc_type_quirks_mode;
00592 } GumboDocument;
00593 
// Payload of every node type other than document and element (the `text`
// member of the GumboNode.v union is commented "For everything else", i.e.
// text, CDATA, comment and whitespace nodes).
00598 typedef struct _GumboText {
        // Text content as a C string. NOTE(review): presumably a processed,
        // NUL-terminated copy - confirm.
00603   const char* text;
00604 
        // Text as a (pointer, length) piece; presumably the raw input span.
00609   GumboStringPiece original_text;
00610 
        // Position where this text starts.
00615   GumboSourcePosition start_pos;
00616 } GumboText;
00617 
// Payload of a GUMBO_NODE_ELEMENT node (the `element` member of the
// GumboNode.v union): children, tag identity and namespace, the tag text as
// string pieces, start/end positions, and attributes.
00622 typedef struct _GumboElement {
        // Child nodes; elements are GumboNode* (see the type annotation comment).
00627   GumboVector /* GumboNode* */ children;
00628 
        // Which tag this element is.
00630   GumboTag tag;
00631 
        // Namespace the tag belongs to (HTML, SVG or MathML).
00633   GumboNamespaceEnum tag_namespace;
00634 
        // Start tag as a (pointer, length) piece. NOTE(review): presumably the
        // raw input span, possibly empty for parser-inserted elements - confirm.
00641   GumboStringPiece original_tag;
00642 
        // End tag as a (pointer, length) piece (same caveat as original_tag).
00648   GumboStringPiece original_end_tag;
00649 
        // Position where the element starts.
00651   GumboSourcePosition start_pos;
00652 
        // Position where the element ends.
00654   GumboSourcePosition end_pos;
00655 
        // Attributes; elements are GumboAttribute* (see the annotation comment).
00660   GumboVector /* GumboAttribute* */ attributes;
00661 } GumboElement;
00662 
// A node of the parse tree: a tagged union. `type` says which member of `v`
// is valid; reading any other member is a misuse of the union.
00667 struct _GumboNode {
        // Discriminator selecting the active member of `v`.
00669   GumboNodeType type;
00670 
        // Parent node. NOTE(review): presumably NULL for the document node.
00672   GumboNode* parent;
00673 
        // Position of this node in its parent's children. Note this is size_t
        // while GumboVector.length is unsigned int.
00675   size_t index_within_parent;
00676 
        // How this node was inserted; see GumboParseFlags.
00682   GumboParseFlags parse_flags;
00683 
        // Type-specific payload, selected by `type`.
00685   union {
00686     GumboDocument document;      // For GUMBO_NODE_DOCUMENT.
00687     GumboElement element;        // For GUMBO_NODE_ELEMENT.
00688     GumboText text;              // For everything else.
00689   } v;
00690 };
00691 
// Pointer type for a custom allocation callback: receives an opaque userdata
// pointer and a size in bytes, returns the allocated memory as void*.
// NOTE(review): presumably follows malloc conventions (NULL on failure); the
// TODO below suggests callers do not yet check for allocation failure.
00698 // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
00699 typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
00700 
// Pointer type for a custom deallocation callback: receives an opaque
// userdata pointer and the pointer to release; returns nothing.
// NOTE(review): presumably `ptr` is always memory obtained from the paired
// GumboAllocatorFunction - confirm.
00705 typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
00706 
// Parser configuration passed to gumbo_parse_with_options and
// gumbo_destroy_output. Defaults are declared as kGumboDefaultOptions;
// their values are not visible in this header.
00713 typedef struct _GumboOptions {
        // Callback used to allocate memory.
00715   GumboAllocatorFunction allocator;
00716 
        // Callback used to release memory.
00718   GumboDeallocatorFunction deallocator;
00719 
        // Opaque pointer; presumably forwarded as the `userdata` argument of the
        // two callbacks above - TODO confirm.
00724   void* userdata;
00725 
        // Tab stop setting. NOTE(review): presumably the column width of a tab
        // when computing GumboSourcePosition.column - confirm.
00730   int tab_stop;
00731 
        // Whether parsing stops at the first error.
00736   bool stop_on_first_error;
00737 
        // Limit on the number of errors. NOTE(review): the meaning of negative
        // values (e.g. "unlimited") is not visible here - confirm.
00745   int max_errors;
00746 } GumboOptions;
00747 
// Shared read-only default GumboOptions; only declared here, defined in
// another translation unit, so the default values are not visible here.
00749 extern const GumboOptions kGumboDefaultOptions;
00750 
// Result of a parse, returned by gumbo_parse / gumbo_parse_with_options and
// released with gumbo_destroy_output.
00752 typedef struct _GumboOutput {
        // The document node. NOTE(review): presumably of type
        // GUMBO_NODE_DOCUMENT - confirm.
00757   GumboNode* document;
00758 
        // The root node. NOTE(review): presumably the <html> element, which would
        // also be reachable through document's children - confirm.
00763   GumboNode* root;
00764 
        // Parse errors; annotated as holding GumboError, a type not declared in
        // this header.
00772   GumboVector /* GumboError */ errors;
00773 } GumboOutput;
00774 
// Parses `buffer` and returns the resulting GumboOutput (struct _GumboOutput
// is the same type as GumboOutput).
// NOTE(review): no length is passed, so the buffer is presumably required to
// be NUL-terminated, and default options are presumably used. The result is
// presumably released with gumbo_destroy_output - confirm all three.
00782 struct _GumboOutput* gumbo_parse(const char* buffer);
00783 
// Parses `buffer_length` chars starting at `buffer` using the given options
// and returns the resulting GumboOutput. Neither the options nor the buffer
// are modified through these pointers (both pointer-to-const).
// NOTE(review): whether the output keeps pointers into `buffer` (so the
// buffer must outlive the output) is not visible here - confirm.
00788 struct _GumboOutput* gumbo_parse_with_options(
00789     const GumboOptions* options, const char* buffer, size_t buffer_length);
00790 
// Releases a GumboOutput. `struct _GumboOptions` is the same type as
// GumboOptions.
// NOTE(review): presumably `options` must carry the same deallocator/userdata
// that were in effect when the output was produced, and `output` must not be
// used afterwards - confirm.
00792 void gumbo_destroy_output(
00793     const struct _GumboOptions* options, GumboOutput* output);
00794 
00795 
00796 #ifdef __cplusplus
00797 }
00798 #endif
00799 
00800 #endif  // GUMBO_GUMBO_H_
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator