diff --git a/.gitignore b/.gitignore index 6c7956e..bf9b71e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,85 +1,74 @@ -# Compilation artifacts -*.o -*.lo -*.la - -# Editor swap files -*.swp -*.swo -*.swn - -#emacs editor leftovers -*.*~ - -#diff leftovers -*.orig - -# gtest pieces -gtest -gtest-1.7.0 -third_party.tar.gz -testdata.tar.gz -.tar.gz -.zip - -# Other build artifacts -/Debug -/visualc/Debug -/visualc/Release -/visualc/gumbo.sdf -/visualc/gumbo.opensdf -/build -.log -.sdf -.opensdf -.deps -.dirstamp -.libs -Makefile -Makefile.in -aclocal.m4 -autom4te.cache -compile -config.guess -config.log -config.status -config.sub -configure -core -depcomp -gtest/ -gumbo.pc -gumbo_test -gumbo_test.log -gumbo_test.trs -install-sh -libtool -ltmain.sh -m4/ -missing -test-driver -test-suite.log - -# gyp android artifacts -gumbo_parser.target.mk - -# `make dist` artifacts -/gumbo-[0-9].[0-9].tar.gz -/gumbo-[0-9].[0-9]/ - -# Python dist artifacts -*.pyc -dist -build -python/gumbo.egg-info -python/gumbo/libgumbo.so - -# Example binaries -benchmark -clean_text -find_links -get_title -positions_of_class -prettyprint -serialize -well_formed +# Backup files left behind by the Emacs editor. +*~ + +# Lock files used by the Emacs editor. +.\#* + +# emacs auto recovery files from aborted edits +\#*\# + +# useful for stashing files +*.orig +*.keep + +# Temporary files used by the vim editor. +.*.swp +.swp + +# A hidden file created by the Mac OS X Finder. 
+.DS_Store + +# Image thumbnail database created by windows +Thumbs.db + +# Various files created by Visual Studio +*.sln +*.suo +*.vcproj +*.user* +#*.rc +*.ncb +*.pch +*.dep +*.idb +*.exp +*.res +*.manifest +*.ilk +*.pdb +*.def +Release +Debug +BuildLog.htm + +# Various files and folders created by CMake +CMakeFiles +CMakeScripts +CMakeCache.txt +*.dir +ALL_BUILD* + + +# Misc files +*.svn +*.a +*.o +*.obj +*.lib +*.exe +*.dll +*.a +*.app +*.xcodeproj +*.pbxbtree +*.pbxindex +*.build +*.smp +*.pl +*.pyc +*.pyo +*.orig +*.bak +*.rar +build + diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c5a4ba..3995772 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ cmake_minimum_required( VERSION 3.0 ) project(gumbo) +find_package(GTest) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/.libs) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/.libs) @@ -28,3 +30,7 @@ endif() add_subdirectory(src/) add_subdirectory(examples/) +if( ${GTEST_FOUND} ) + add_subdirectory(tests/) +endif() + diff --git a/ChangeLog.txt b/ChangeLog.txt new file mode 100644 index 0000000..98d6953 --- /dev/null +++ b/ChangeLog.txt @@ -0,0 +1,28 @@ +List of Changes since the Fork +============================== +In reverse chronological order: + +- change update foreign attributes to remove xml:base and add xlink:arcrole to follow whatwg spec +- fixed minor memory leaks +- correctly handle text in form elements +- support for new tags including search and to better follow the latest whatwg parsing spec +- fix for handling

and
in foreign contexts +- Fix multiple warnings including cast to enum from void pointer +- Re-implement adjust_foreign_attributes() with a gperf hash +- Remove special handling of tag +- Remove special handling of tag +- Use realloc(3) instead of malloc(3) in enlarge_vector_if_full() +- Use realloc(3) instead of malloc(3) in maybe_resize_string_buffer() +- Make destroy_node() function non-recursive +- Fix signedness of some format specifiers +- Add maximum element nesting limit +- Remove custom allocator support +- Fix recording of source positions for end tags +- Fix TAGSET_INCLUDES macro to work properly with multiple bit flags +- Re-implement gumbo_normalize_svg_tagname() with a gperf hash +- Replace linear array search in adjust_svg_attributes() with a gperf hash +- Fix duplicate TagSet initializer being ignored in is_special_node() +- Add support for tag +- Add missing static qualifiers to hide symbols that shouldn't be extern +- Replace use of locale-dependent ctype.h functions with custom, ASCII-only equiv +- allow support for xhtml parsing rules controllable via GumboOptions use_xhtml_rules flag diff --git a/README.md b/README.md index 72a4d8c..c1ba5df 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,32 @@ -Gumbo - A pure-C HTML5 parser. -============ - +Sigil-Gumbo - A pure-C HTML5 parser. +=================================== -This is a customized version of gumbo built on top of on the following repository: +This is a customized and highly updated version of Google's gumbo html parser built +on top of the following repository as its initial commit: https://github.com/vmg/gumbo-parser/tree/v1.0.0 -This repository has forked from the main repository to include memory improvements, and parser simplifications, speed improvements, and -interface changes that allow simple editing of the tree post parsing. 
-It has also been modified to supports the use of xhtml parsing rules (see the XMTML5 parsing comments in src/parser.c) and -modified to recognize all svg and mathml tags in the tag enum. - -Due to recognizing a larger set of tags, we use our own version of tag_perf.h which is a variation of a minimal perfect hash -function for our much larger set of tags. +Gumbo is an Apache Licensed implementation of the [HTML5 parsing algorithm][] implemented +as a pure C99 library with no outside dependencies. It's designed to serve +as a building block for other tools and libraries such as linters, +validators, templating languages, and refactoring and analysis tools. -You can not simply replace this version of gumbo with the one from google/gumbo-parser without breaking Sigil. +This repository has forked from the main repository to include memory improvements, and parser +simplifications, speed improvements, and interface changes that allow simple editing of the tree +post parsing. -Here is the remainder of the official README.md +It has also been modified to support the use of xhtml parsing rules (turned on and off by settings) +(see the XMTML5 parsing comments in src/parser.c) and modified to recognize all current html, svg +and mathml tags in the tag enum. -[![Build Status](https://travis-ci.org/google/gumbo-parser.svg?branch=master)](https://travis-ci.org/google/gumbo-parser) +Due to recognizing a larger set of tags, we use our own version of tag_perf.h which is a variation +of a minimal perfect hash function for our much larger set of tags. -Gumbo is an implementation of the [HTML5 parsing algorithm][] implemented -as a pure C99 library with no outside dependencies. It's designed to serve -as a building block for other tools and libraries such as linters, -validators, templating languages, and refactoring and analysis tools. +You can not simply replace this version of gumbo with the one from google/gumbo-parser without breaking Sigil. 
-Goals & features: +Original Goals & features: -* Fully conformant with the [HTML5 spec][]. +* Fully conformant with the latest HTML spec. * Robust and resilient to bad input. * Simple API that can be easily wrapped by other languages. * Support for source locations and pointers back to the original text. @@ -36,112 +35,36 @@ Goals & features: * Passes all [html5lib tests][], including the template tag. * Tested on over 2.5 billion pages from Google's index. -Non-goals: - -* Execution speed. Gumbo gains some of this by virtue of being written in - C, but it is not an important consideration for the intended use-case, and - was not a major design factor. -* Support for encodings other than UTF-8. For the most part, client code - can convert the input stream to UTF-8 text using another library before - processing. -* Mutability. Gumbo is intentionally designed to turn an HTML document into a - parse tree, and free that parse tree all at once. It's not designed to - persistently store nodes or subtrees outside of the parse tree, or to perform - arbitrary DOM mutations within your program. If you need this functionality, - we recommend translating the Gumbo parse tree into a mutable DOM - representation more suited for the particular needs of your program before - operating on it. -* C89 support. Most major compilers support C99 by now; the major exception - (Microsoft Visual Studio) should be able to compile this in C++ mode with - relatively few changes. (Bug reports welcome.) -* ~~Security. Gumbo was initially designed for a product that worked with - trusted input files only. We're working to harden this and make sure that it - behaves as expected even on malicious input, but for now, Gumbo should only be - run on trusted input or within a sandbox.~~ Gumbo underwent a number of - security fixes and passed Google's security review as of version 0.9.1. 
- -Wishlist (aka "We couldn't get these into the original release, but are -hoping to add them soon"): - -* Full-featured error reporting. -* Additional performance improvements. -* DOM wrapper library/libraries (possibly within other language bindings) -* Query libraries, to extract information from parse trees using CSS or XPATH. Installation ============ -To build and install the library, issue the standard UNIX incantation from -the root of the distribution: - -```bash -$ ./autogen.sh -$ ./configure -$ make -$ sudo make install -``` - -Gumbo comes with full pkg-config support, so you can use the pkg-config to -print the flags needed to link your program against it: +To build and install the library, use CMake version 3.0 or later -```bash -$ pkg-config --cflags gumbo # print compiler flags -$ pkg-config --libs gumbo # print linker flags -$ pkg-config --cflags --libs gumbo # print both -``` + git clone https://github.com/Sigil-Ebook/sigil-gumbo.git + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release ../sigil-gumbo + make -j4 -For example: -```bash -$ gcc my_program.c `pkg-config --cflags --libs gumbo` -``` +To run the html5lib tree-construction test suite from the build directory +after building + + cd python/gumbo + python3 ./html5lib_adapter_test.py -See the pkg-config man page for more info. -There are a number of sample programs in the examples/ directory. They're -built automatically by 'make', but can also be made individually with -`make ` (eg. `make clean_text`). -To run the unit tests, you'll need to have [googletest][] downloaded and -unzipped. 
The googletest maintainers recommend against using -`make install`; instead, symlink the root googletest directory to 'gtest' -inside gumbo's root directory, and then `make check`: +If GoogleTest was found during the build you should be able to run +our unit tests from the build directory as follows: -```bash -$ unzip gtest-1.6.0.zip -$ cd gumbo-* -$ ln -s ../gtest-1.6.0 gtest -$ make check -``` + cd bin + ./run_tests + -Gumbo's `make check` has code to automatically configure & build gtest and -then link in the library. - -Debian and Fedora users can install libgtest with: - -```bash -$ apt-get install libgtest-dev # Debian/Ubuntu -$ yum install gtest-devel # CentOS/Fedora -``` - -Note for Ubuntu users: libgtest-dev package only install source files. -You have to make libraries yourself using cmake: - - $ sudo apt-get install cmake - $ cd /usr/src/gtest - $ sudo cmake CMakeLists.txt - $ sudo make - $ sudo cp *.a /usr/lib - -The configure script will detect the presence of the library and use that -instead. - -Note that you need to have super user privileges to execute these commands. -On most distros, you can prefix the commands above with `sudo` to execute -them as the super user. - -Debian installs usually don't have `sudo` installed (Ubuntu however does.) -Switch users first with `su -`, then run `apt-get`. +There are a number of sample programs in the examples/ directory that are +compiled and stored in the build directory in bin automatically. Basic Usage =========== @@ -159,71 +82,52 @@ int main() { } ``` -See the API documentation and sample programs for more details. -A note on API/ABI compatibility -=============================== +Modifying this Software +======================= -We'll make a best effort to preserve API compatibility between releases. 
-The initial release is a 0.9 (beta) release to solicit comments from early -adopters, but if no major problems are found with the API, a 1.0 release -will follow shortly, and the API of that should be considered stable. If -changes are necessary, we follow [semantic versioning][]. +Many of the source files use gperf perfect hash algorithms and are +therefore generated. This archive includes pre-generated versions of +all needed source files. But if any of them need to be modified, +then new source files will need to be generated: -We make no such guarantees about the ABI, and it's very likely that -subsequent versions may require a recompile of client code. For this -reason, we recommend NOT using Gumbo data structures throughout a program, -and instead limiting them to a translation layer that picks out whatever -data is needed from the parse tree and then converts that to persistent -data structures more appropriate for the application. The API is -structured to encourage this use, with a single delete function for the -whole parse tree, and is not designed with mutation in mind. +- To rebuild the set of recognized html5 elements do the following + from the main directory: + + python3 gentags.py src/tag.in -Python usage -============ -To install the python bindings, make sure that the -C library is installed first, and then `sudo python setup.py install` from -the root of the distro. This installs a 'gumbo' module; `pydoc gumbo` -should tell you about it. +- To rebuild the set of svg name related fixups do the following + from the main directory: -Recommended best-practice for Python usage is to use one of the adapters to -an existing API (personally, I prefer BeautifulSoup) and write your program -in terms of those. The raw CTypes bindings should be considered building -blocks for higher-level libraries and rarely referenced directly. 
+ gperf -m100 src/svg_tags.gperf > src/svg_tags.c + + +- To rebuild the set of svg attribute name related fixups do the + following from the main directory: + + gperf -m100 src/svg_attrs.gperf > src/svg_attrs.c -External Bindings -================= - -The following language bindings are maintained by various contributors in -other repositories: - -* Ruby: - * [ruby-gumbo] by Nicolas Martyanoff - * [nokogumbo] by Sam Ruby -* Node.js: [node-gumbo-parser] by Karl Westin -* D: [gumbo-d] by Christopher Bertels -* Lua: [lua-gumbo] by Craig Barnes -* Objective-C: - * [ObjectiveGumbo] by Programming Thomas - * [OCGumbo] by TracyYih -* C#: [GumboBindings] by Vladimir Zotov -* PHP: [GumboPHP] by Paul Preece -* Perl: [HTML::Gumbo] by Ruslan Zakirov - -[ruby-gumbo]: https://github.com/nevir/ruby-gumbo -[nokogumbo]: https://github.com/rubys/nokogumbo -[node-gumbo-parser]: https://github.com/karlwestin/node-gumbo-parser -[gumbo-d]: https://github.com/bakkdoor/gumbo-d -[lua-gumbo]: https://github.com/craigbarnes/lua-gumbo -[OCGumbo]: https://github.com/tracy-e/OCGumbo -[ObjectiveGumbo]: https://github.com/programmingthomas/ObjectiveGumbo -[GumboBindings]: https://github.com/rgripper/GumboBindings -[GumboPHP]: https://github.com/BipSync/gumbo + +- To rebuild the set of foreign attribute name fixups do the + following from the main directory: + + gperf -m100 -n src/foreign_attrs.gperf > src/foreign_attrs.c + + + +See the API documentation from the archived google gumbo github site +and sample programs for more details. + +Recommended best-practice for Python usage is to use one of the adapters +to an existing API and write your program in terms of those. +(See the python/gumbo directory for the html5lib, gumbo and bs4 adapter code.) + +The raw CTypes bindings should be considered building +blocks for higher-level libraries and rarely referenced directly. 
+[Archived Google Gumbo github site]: https://github.com/google/gumbo-parser [HTML5 parsing algorithm]: http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12 [HTML5 spec]: http://www.whatwg.org/specs/web-apps/current-work/multipage/ [html5lib tests]: https://github.com/html5lib/html5lib-tests [googletest]: https://code.google.com/p/googletest/ -[semantic versioning]: http://semver.org/ -[HTML::Gumbo]: https://metacpan.org/pod/HTML::Gumbo diff --git a/archived_upstream/README.md b/archived_upstream/README.md new file mode 100644 index 0000000..72a4d8c --- /dev/null +++ b/archived_upstream/README.md @@ -0,0 +1,229 @@ +Gumbo - A pure-C HTML5 parser. +============ + + +This is a customized version of gumbo built on top of on the following repository: + + https://github.com/vmg/gumbo-parser/tree/v1.0.0 + +This repository has forked from the main repository to include memory improvements, and parser simplifications, speed improvements, and +interface changes that allow simple editing of the tree post parsing. +It has also been modified to supports the use of xhtml parsing rules (see the XMTML5 parsing comments in src/parser.c) and +modified to recognize all svg and mathml tags in the tag enum. + +Due to recognizing a larger set of tags, we use our own version of tag_perf.h which is a variation of a minimal perfect hash +function for our much larger set of tags. + +You can not simply replace this version of gumbo with the one from google/gumbo-parser without breaking Sigil. + +Here is the remainder of the official README.md + +[![Build Status](https://travis-ci.org/google/gumbo-parser.svg?branch=master)](https://travis-ci.org/google/gumbo-parser) + +Gumbo is an implementation of the [HTML5 parsing algorithm][] implemented +as a pure C99 library with no outside dependencies. It's designed to serve +as a building block for other tools and libraries such as linters, +validators, templating languages, and refactoring and analysis tools. 
+ +Goals & features: + +* Fully conformant with the [HTML5 spec][]. +* Robust and resilient to bad input. +* Simple API that can be easily wrapped by other languages. +* Support for source locations and pointers back to the original text. +* Support for fragment parsing. +* Relatively lightweight, with no outside dependencies. +* Passes all [html5lib tests][], including the template tag. +* Tested on over 2.5 billion pages from Google's index. + +Non-goals: + +* Execution speed. Gumbo gains some of this by virtue of being written in + C, but it is not an important consideration for the intended use-case, and + was not a major design factor. +* Support for encodings other than UTF-8. For the most part, client code + can convert the input stream to UTF-8 text using another library before + processing. +* Mutability. Gumbo is intentionally designed to turn an HTML document into a + parse tree, and free that parse tree all at once. It's not designed to + persistently store nodes or subtrees outside of the parse tree, or to perform + arbitrary DOM mutations within your program. If you need this functionality, + we recommend translating the Gumbo parse tree into a mutable DOM + representation more suited for the particular needs of your program before + operating on it. +* C89 support. Most major compilers support C99 by now; the major exception + (Microsoft Visual Studio) should be able to compile this in C++ mode with + relatively few changes. (Bug reports welcome.) +* ~~Security. Gumbo was initially designed for a product that worked with + trusted input files only. We're working to harden this and make sure that it + behaves as expected even on malicious input, but for now, Gumbo should only be + run on trusted input or within a sandbox.~~ Gumbo underwent a number of + security fixes and passed Google's security review as of version 0.9.1. 
+ +Wishlist (aka "We couldn't get these into the original release, but are +hoping to add them soon"): + +* Full-featured error reporting. +* Additional performance improvements. +* DOM wrapper library/libraries (possibly within other language bindings) +* Query libraries, to extract information from parse trees using CSS or XPATH. + +Installation +============ + +To build and install the library, issue the standard UNIX incantation from +the root of the distribution: + +```bash +$ ./autogen.sh +$ ./configure +$ make +$ sudo make install +``` + +Gumbo comes with full pkg-config support, so you can use the pkg-config to +print the flags needed to link your program against it: + +```bash +$ pkg-config --cflags gumbo # print compiler flags +$ pkg-config --libs gumbo # print linker flags +$ pkg-config --cflags --libs gumbo # print both +``` + +For example: + +```bash +$ gcc my_program.c `pkg-config --cflags --libs gumbo` +``` + +See the pkg-config man page for more info. + +There are a number of sample programs in the examples/ directory. They're +built automatically by 'make', but can also be made individually with +`make ` (eg. `make clean_text`). + +To run the unit tests, you'll need to have [googletest][] downloaded and +unzipped. The googletest maintainers recommend against using +`make install`; instead, symlink the root googletest directory to 'gtest' +inside gumbo's root directory, and then `make check`: + +```bash +$ unzip gtest-1.6.0.zip +$ cd gumbo-* +$ ln -s ../gtest-1.6.0 gtest +$ make check +``` + +Gumbo's `make check` has code to automatically configure & build gtest and +then link in the library. + +Debian and Fedora users can install libgtest with: + +```bash +$ apt-get install libgtest-dev # Debian/Ubuntu +$ yum install gtest-devel # CentOS/Fedora +``` + +Note for Ubuntu users: libgtest-dev package only install source files. 
+You have to make libraries yourself using cmake: + + $ sudo apt-get install cmake + $ cd /usr/src/gtest + $ sudo cmake CMakeLists.txt + $ sudo make + $ sudo cp *.a /usr/lib + +The configure script will detect the presence of the library and use that +instead. + +Note that you need to have super user privileges to execute these commands. +On most distros, you can prefix the commands above with `sudo` to execute +them as the super user. + +Debian installs usually don't have `sudo` installed (Ubuntu however does.) +Switch users first with `su -`, then run `apt-get`. + +Basic Usage +=========== + +Within your program, you need to include "gumbo.h" and then issue a call to +`gumbo_parse`: + +```C +#include "gumbo.h" + +int main() { + GumboOutput* output = gumbo_parse("

Hello, World!

"); + // Do stuff with output->root + gumbo_destroy_output(&kGumboDefaultOptions, output); +} +``` + +See the API documentation and sample programs for more details. + +A note on API/ABI compatibility +=============================== + +We'll make a best effort to preserve API compatibility between releases. +The initial release is a 0.9 (beta) release to solicit comments from early +adopters, but if no major problems are found with the API, a 1.0 release +will follow shortly, and the API of that should be considered stable. If +changes are necessary, we follow [semantic versioning][]. + +We make no such guarantees about the ABI, and it's very likely that +subsequent versions may require a recompile of client code. For this +reason, we recommend NOT using Gumbo data structures throughout a program, +and instead limiting them to a translation layer that picks out whatever +data is needed from the parse tree and then converts that to persistent +data structures more appropriate for the application. The API is +structured to encourage this use, with a single delete function for the +whole parse tree, and is not designed with mutation in mind. + +Python usage +============ + +To install the python bindings, make sure that the +C library is installed first, and then `sudo python setup.py install` from +the root of the distro. This installs a 'gumbo' module; `pydoc gumbo` +should tell you about it. + +Recommended best-practice for Python usage is to use one of the adapters to +an existing API (personally, I prefer BeautifulSoup) and write your program +in terms of those. The raw CTypes bindings should be considered building +blocks for higher-level libraries and rarely referenced directly. 
+ +External Bindings +================= + +The following language bindings are maintained by various contributors in +other repositories: + +* Ruby: + * [ruby-gumbo] by Nicolas Martyanoff + * [nokogumbo] by Sam Ruby +* Node.js: [node-gumbo-parser] by Karl Westin +* D: [gumbo-d] by Christopher Bertels +* Lua: [lua-gumbo] by Craig Barnes +* Objective-C: + * [ObjectiveGumbo] by Programming Thomas + * [OCGumbo] by TracyYih +* C#: [GumboBindings] by Vladimir Zotov +* PHP: [GumboPHP] by Paul Preece +* Perl: [HTML::Gumbo] by Ruslan Zakirov + +[ruby-gumbo]: https://github.com/nevir/ruby-gumbo +[nokogumbo]: https://github.com/rubys/nokogumbo +[node-gumbo-parser]: https://github.com/karlwestin/node-gumbo-parser +[gumbo-d]: https://github.com/bakkdoor/gumbo-d +[lua-gumbo]: https://github.com/craigbarnes/lua-gumbo +[OCGumbo]: https://github.com/tracy-e/OCGumbo +[ObjectiveGumbo]: https://github.com/programmingthomas/ObjectiveGumbo +[GumboBindings]: https://github.com/rgripper/GumboBindings +[GumboPHP]: https://github.com/BipSync/gumbo + +[HTML5 parsing algorithm]: http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12 +[HTML5 spec]: http://www.whatwg.org/specs/web-apps/current-work/multipage/ +[html5lib tests]: https://github.com/html5lib/html5lib-tests +[googletest]: https://code.google.com/p/googletest/ +[semantic versioning]: http://semver.org/ +[HTML::Gumbo]: https://metacpan.org/pod/HTML::Gumbo diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 407f731..e451bac 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -3,12 +3,34 @@ cmake_minimum_required(VERSION 3.0) -project(clean_text) +# project(examples) -set(SOURCES clean_text.cc) -add_executable(${PROJECT_NAME} ${SOURCES}) +if( APPLE ) + set(CMAKE_MACOSX_RPATH 1) +endif() + +if( UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") +endif() + +set(CXX_APPS clean_text find_links prettyprint positions_of_class 
well_formed serialize ) + +set(C_APPS get_title ) target_include_directories(${PROJECT_NAME} BEFORE PUBLIC ${GUMBO_INCLUDE_DIRS}) -target_link_libraries(${PROJECT_NAME} ${GUMBO_LIBRARIES}) +foreach( app_name ${CXX_APPS} ) + set( app_source_file "${app_name}.cc" ) + add_executable( ${app_name} ${app_source_file} ) + target_link_libraries( ${app_name} ${GUMBO_LIBRARIES} ) +endforeach( app_name ${CXX_APPS} ) + +foreach( app_name ${C_APPS} ) + set( app_source_file "${app_name}.c" ) + add_executable( ${app_name} ${app_source_file} ) + target_link_libraries( ${app_name} ${GUMBO_LIBRARIES} ) +endforeach( app_name ${C_APPS} ) + + diff --git a/gentags.py b/gentags.py index 7aca501..98acee7 100755 --- a/gentags.py +++ b/gentags.py @@ -22,18 +22,18 @@ def generate_tag_headers(): open("src/tag_sizes.h", "wb") as tag_sizes, \ open("python/gumbo/gumboc_tags.py", "wb") as tag_py, \ open('src/tag.in', 'rb') as tagfile: - tag_py.write('TagNames = [\n') + tag_py.write('TagNames = [\n'.encode('utf-8')) for f in (tag_strings, tag_enum, tag_sizes): f.write(HEADER.format('tag').encode('utf-8')) for tag in tagfile: tag = tag.decode('utf-8').strip() tag_upper = tag.upper().replace('-', '_') - tag_py.write('\t"%s",\n' % tag_upper) + tag_py.write(('\t"%s",\n' % tag_upper).encode('utf-8')) tag_strings.write(('"%s",\n' % tag).encode('utf-8')) tag_enum.write(('GUMBO_TAG_%s,\n' % tag_upper).encode('utf-8')) tag_sizes.write(('%d, ' % len(tag)).encode('utf-8')) tag_sizes.write(b'\n') - tag_py.write(']\n') + tag_py.write(']\n'.encode('utf-8')) def generate_tag_perfect_hash(repetitions=200): raw = subprocess.check_output( diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py index df20936..93ecd69 100644 --- a/python/gumbo/gumboc_tags.py +++ b/python/gumbo/gumboc_tags.py @@ -195,6 +195,7 @@ "PARAM", "PATH", "PATTERN", + "PICTURE", "PLAINTEXT", "POLYGON", "POLYLINE", @@ -211,10 +212,12 @@ "S", "SAMP", "SCRIPT", + "SEARCH", "SECTION", "SELECT", "SEMANTICS", "SET", + "SLOT", "SMALL", 
"SOURCE", "SPACER", diff --git a/python/gumbo/html5lib_adapter.py b/python/gumbo/html5lib_adapter.py index e311047..adc3c4a 100644 --- a/python/gumbo/html5lib_adapter.py +++ b/python/gumbo/html5lib_adapter.py @@ -98,6 +98,8 @@ def __init__(self, tree): self.tree = tree def parse(self, text_or_file, **kwargs): + # need to tell sigil gumbo to use html parsing rules not xhtml parsing rules + kwargs["use_xhtml_rules"] = False try: text = text_or_file.read() except AttributeError: @@ -116,6 +118,8 @@ def parse(self, text_or_file, **kwargs): return self.tree.getDocument() def parseFragment(self, text_or_file, container, **kwargs): + # need to tell sigil gumbo to use html parsing rules not xhtml parsing rules + kwargs["use_xhtml_rules"] = False try: text = text_or_file.read() except AttributeError: diff --git a/python/gumbo/html5lib_adapter_test.py b/python/gumbo/html5lib_adapter_test.py index a39257b..9f4d9f1 100644 --- a/python/gumbo/html5lib_adapter_test.py +++ b/python/gumbo/html5lib_adapter_test.py @@ -18,7 +18,8 @@ import glob import os import re -import StringIO +import io + import unittest import warnings @@ -75,7 +76,7 @@ def isSectionHeading(self, line): def normaliseOutput(self, data): # Remove trailing newlines - for key, value in data.iteritems(): + for key, value in data.items(): if value.endswith("\n"): data[key] = value[:-1] return data @@ -125,9 +126,9 @@ def impl(self, inner_html, input, expected, errors): if inner_html: document = p.parseFragment( - StringIO.StringIO(input), inner_html.replace('math ', 'mathml ')) + io.StringIO(input), inner_html.replace('math ', 'mathml ')) else: - document = p.parse(StringIO.StringIO(input)) + document = p.parse(io.StringIO(input)) with warnings.catch_warnings(): # Etree serializer in html5lib uses a deprecated getchildren() API. 
@@ -143,8 +144,8 @@ def impl(self, inner_html, input, expected, errors): error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected, '\nReceived:', output]) - self.assertEquals(expected, output, - error_msg.encode('ascii', 'xmlcharrefreplace') + '\n') + self.assertEqual(expected, output, + error_msg.encode('ascii', 'xmlcharrefreplace') + b'\n') # TODO(jdtang): Check error messages, when there's full error support. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 82a8966..d00de09 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -43,9 +43,12 @@ if ( NOT GUMBO_IS_SUBTREE ) # Copy python scripts to build directory add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND cmake -E copy_directory ${CMAKE_SOURCE_DIR}/python ${TOP_BUILD_LEVEL}/python) + # Copy testdata to build directory + add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND cmake -E copy_directory + ${CMAKE_SOURCE_DIR}/testdata ${TOP_BUILD_LEVEL}/testdata) # Copy python module setup.py to build directory add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND cmake -E copy - ${CMAKE_SOURCE_DIR}/setup.py ${TOP_BUILD_LEVEL}) + ${CMAKE_SOURCE_DIR}/setup.py ${TOP_BUILD_LEVEL}) endif() if ( MSVC AND NOT GUMBO_STATIC_LIB ) @@ -74,6 +77,6 @@ if(MSVC) set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS} /LTCG") endif() -if( UNIX AND NOT APPLE ) +if( UNIX ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") endif() diff --git a/src/foreign_attrs.c b/src/foreign_attrs.c index d2b03c9..8df5837 100644 --- a/src/foreign_attrs.c +++ b/src/foreign_attrs.c @@ -1,6 +1,6 @@ -/* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: gperf -m100 -n foreign_attrs.gperf */ -/* Computed positions: -k'2,8' */ +/* C code produced by gperf version 3.0.3 */ +/* Command-line: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/gperf -m100 -n ./foreign_attrs.gperf */ +/* Computed positions: -k'8-9' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ @@ -26,10 +26,10 @@ && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) /* The character set is not based on ISO-646. */ -#error "gperf generated tables don't work with this execution character set. Please report a bug to ." +error "gperf generated tables don't work with this execution character set. Please report a bug to ." #endif -#line 1 "foreign_attrs.gperf" +#line 1 "./foreign_attrs.gperf" #include "replacement.h" #include @@ -49,7 +49,9 @@ inline #endif #endif static unsigned int -hash (register const char *str, register size_t len) +hash (str, len) + register const char *str; + register unsigned int len; { static const unsigned char asso_values[] = { @@ -63,9 +65,9 @@ hash (register const char *str, register size_t len) 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 2, - 11, 10, 11, 9, 7, 6, 11, 11, 1, 0, - 11, 5, 11, 11, 4, 11, 11, 11, 11, 11, - 11, 3, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 1, 11, 10, 4, 4, 11, 11, 3, 11, + 11, 5, 3, 11, 0, 11, 2, 11, 11, 11, + 11, 2, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, @@ -85,56 +87,54 @@ hash (register const char *str, register size_t len) switch (len) { default: + hval += asso_values[(unsigned char)str[8]]; + /*FALLTHROUGH*/ + case 8: hval += asso_values[(unsigned char)str[7]]; /*FALLTHROUGH*/ case 7: case 6: case 5: - case 4: - case 3: - case 2: - hval += asso_values[(unsigned char)str[1]]; break; } return hval; } -const ForeignAttrReplacement * -gumbo_get_foreign_attr_replacement (register const char *str, register size_t len) +const ForeignAttrReplacement * gumbo_get_foreign_attr_replacement (const char * str, size_t len) { static const unsigned char 
lengthtable[] = { - 5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8 + 5, 10, 13, 9, 13, 10, 11, 11, 10, 10, 8 }; static const ForeignAttrReplacement wordlist[] = { -#line 25 "foreign_attrs.gperf" +#line 25 "./foreign_attrs.gperf" {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, -#line 26 "foreign_attrs.gperf" - {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, -#line 24 "foreign_attrs.gperf" +#line 18 "./foreign_attrs.gperf" + {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, +#line 17 "./foreign_attrs.gperf" + {"xlink:arcrole", "arcrole", GUMBO_ATTR_NAMESPACE_XLINK}, +#line 24 "./foreign_attrs.gperf" {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, -#line 16 "foreign_attrs.gperf" +#line 16 "./foreign_attrs.gperf" {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 21 "foreign_attrs.gperf" +#line 22 "./foreign_attrs.gperf" {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 17 "foreign_attrs.gperf" - {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 18 "foreign_attrs.gperf" - {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 20 "foreign_attrs.gperf" +#line 21 "./foreign_attrs.gperf" {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 19 "foreign_attrs.gperf" +#line 26 "./foreign_attrs.gperf" + {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, +#line 19 "./foreign_attrs.gperf" + {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, +#line 20 "./foreign_attrs.gperf" {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, -#line 23 "foreign_attrs.gperf" - {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, -#line 22 "foreign_attrs.gperf" - {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML} +#line 23 "./foreign_attrs.gperf" + {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML} }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) { - register unsigned int key = hash (str, len); + unsigned int key = hash (str, len); if (key <= MAX_HASH_VALUE) if (len == lengthtable[key]) diff --git a/src/foreign_attrs.gperf b/src/foreign_attrs.gperf index 
8824b2a..215f30c 100644 --- a/src/foreign_attrs.gperf +++ b/src/foreign_attrs.gperf @@ -14,12 +14,12 @@ ForeignAttrReplacement; %% "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:arcrole", "arcrole", GUMBO_ATTR_NAMESPACE_XLINK "xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK "xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK "xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK "xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK "xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK -"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML "xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML "xml:space", "space", GUMBO_ATTR_NAMESPACE_XML "xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS diff --git a/src/gumbo.h b/src/gumbo.h index b579c85..ed1beb5 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -559,7 +559,7 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); /** * Input struct containing configuration options for the parser. - * These let you specify alternate memory managers, provide different error + * These let you provide different error * handling, etc. * Use kGumboDefaultOptions for sensible defaults, and only set what you need. */ @@ -599,6 +599,7 @@ typedef struct GumboInternalOptions { * Default: 50 */ int max_errors; + } GumboOptions; /** Default options struct; use this with gumbo_parse_with_options. 
*/ diff --git a/src/parser.c b/src/parser.c index eec282c..fb2a473 100644 --- a/src/parser.c +++ b/src/parser.c @@ -55,11 +55,11 @@ static bool handle_in_template(GumboParser*, GumboToken*); static void free_node(GumboNode* node); const GumboOptions kGumboDefaultOptions = { - 4, /* tab_stop */ - true, /* use_xhtml_rules */ - false, /* stop_on_first_error */ - 400, /* max_tree_depth */ - 50, /* max_errors */ + 4, /* tab_stop */ + true, /* use_xhtml_rules */ + false, /* stop_on_first_error */ + 400, /* max_tree_depth */ + 50, /* max_errors */ }; static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); @@ -383,6 +383,11 @@ static void parser_state_destroy(GumboParser* parser) { if (state->_fragment_ctx) { free_node(state->_fragment_ctx); } + if (state->_form_element) { + if (state->_form_element->parse_flags & GUMBO_INSERTION_BY_PARSER) { + free_node(state->_form_element); + } + } gumbo_vector_destroy(&state->_active_formatting_elements); gumbo_vector_destroy(&state->_open_elements); gumbo_vector_destroy(&state->_template_insertion_modes); @@ -1431,7 +1436,7 @@ static bool is_special_node(const GumboNode* node) { TAG(IFRAME), TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING), TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), TAG(PARAM), - TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), TAG(SELECT), TAG(STYLE), + TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SEARCH), TAG(SECTION), TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), @@ -2476,6 +2481,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { case GUMBO_TAG_NAV: case GUMBO_TAG_OL: case GUMBO_TAG_P: + case GUMBO_TAG_SEARCH: case GUMBO_TAG_SECTION: case GUMBO_TAG_SUMMARY: case GUMBO_TAG_UL: { @@ -2948,6 +2954,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { case 
GUMBO_TAG_NAV: case GUMBO_TAG_OL: case GUMBO_TAG_PRE: + case GUMBO_TAG_SEARCH: case GUMBO_TAG_SECTION: case GUMBO_TAG_SUMMARY: case GUMBO_TAG_UL: { @@ -2971,7 +2978,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { generate_implied_end_tags(parser, GUMBO_TAG_LAST); if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) { parser_add_parse_error(parser, token); - return false; + // was return false + success = false; } while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM)); return success; @@ -2995,7 +3003,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { if (get_current_node(parser) != node) { parser_add_parse_error(parser, token); result = false; - } else record_end_of_element(token, &node->v.element); + } // else record_end_of_element(token, &node->v.element); GumboVector* open_elements = &state->_open_elements; int index = gumbo_vector_index_of(open_elements, node); @@ -3177,8 +3185,10 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { switch (token->type) { case GUMBO_TOKEN_CHARACTER: case GUMBO_TOKEN_WHITESPACE: - case GUMBO_TOKEN_NULL: - if (node_tag_in_set(get_current_node(parser), (const gumbo_tagset) { TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR) })) { + /* + * case GUMBO_TOKEN_NULL: + * if (node_tag_in_set(get_current_node(parser), (const gumbo_tagset) { TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR) })) { + */ // The "pending table character tokens" list described in the spec is // nothing more than the TextNodeBufferState. 
We accumulate text tokens // as normal, except that when we go to flush them in the handle_in_table_text, @@ -3189,7 +3199,10 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { state->_reprocess_current_token = true; set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT); return true; - } + /* + * } + */ + case GUMBO_TOKEN_DOCTYPE: parser_add_parse_error(parser, token); ignore_token(parser); @@ -4293,23 +4306,17 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { /* Parse error */ parser_add_parse_error(parser, token); - /* - * Fragment case: If the parser was originally created for the HTML - * fragment parsing algorithm, then act as described in the "any other - * start tag" entry below. - */ - if (!is_fragment_parser(parser)) { - do { + while( !(is_mathml_integration_point(get_current_node(parser)) || + is_html_integration_point(get_current_node(parser)) || + get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) ) { + pop_current_node(parser); - } while(!(is_mathml_integration_point(get_current_node(parser)) || - is_html_integration_point(get_current_node(parser)) || - get_current_node(parser)->v.element.tag_namespace == - GUMBO_NAMESPACE_HTML)); - parser->_parser_state->_reprocess_current_token = true; - return false; } - - assert(token->type == GUMBO_TOKEN_START_TAG); + /* + * parser->_parser_state->_reprocess_current_token = true; + * return false; + */ + return handle_html_content(parser, token); } if (token->type == GUMBO_TOKEN_START_TAG) { @@ -4492,6 +4499,29 @@ static void fragment_parser_init( // 10. reset_insertion_mode_appropriately(parser); + +#if 0 + // 11. 
+ if (ctx_has_form_ancestor + || (fragment_context == GUMBO_TAG_FORM + && fragment_namespace == GUMBO_NAMESPACE_HTML)) { + GumboNode * form_ancestor = create_node(GUMBO_NODE_ELEMENT); + form_ancestor->parent = NULL; + form_ancestor->index_within_parent = -1; + form_ancestor->parse_flags = GUMBO_INSERTION_BY_PARSER; + gumbo_vector_init(1, &form_ancestor->v.element.children); + form_ancestor->v.element.tag = GUMBO_TAG_FORM; + form_ancestor->v.element.tag_namespace = GUMBO_NAMESPACE_HTML; + form_ancestor->v.element.original_tag = kGumboEmptyString; + form_ancestor->v.element.original_end_tag = kGumboEmptyString; + form_ancestor->v.element.start_pos = kGumboEmptySourcePosition; + form_ancestor->v.element.end_pos = kGumboEmptySourcePosition; + gumbo_vector_init(1, &form_ancestor->v.element.attributes); + }; + parser->_parser_state->_form_element = form_ancestor; + } +#endif + } GumboOutput* gumbo_parse(const char* buffer) { @@ -4625,13 +4655,31 @@ GumboOutput* gumbo_parse_fragment( } } +#if 0 + if (!state->_reprocess_current_token) { + // we are done processing this token + if (!parser._options->use_xhtml_rules) { + // in not using xhtml rules then handle + // non void html element start tag with trailing solidus errors + if (token.type == GUMBO_TOKEN_START_TAG && + token.v.start_tag.is_self_closing && + !state->_self_closing_flag_acknowledged) { + GumboError* error = parser_add_parse_error(&parser, &token); + if (error) { + error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; + } + } + } + } +#else if (!state->_self_closing_flag_acknowledged) { GumboError* error = parser_add_parse_error(&parser, &token); if (error) { error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; } } - +#endif + // use of unlikely might help here but does not exist on windows if (state->_open_elements.length > max_tree_depth) { /* this block is unlikely to be taken */ diff --git a/src/tag.in b/src/tag.in index c2c1678..11abd32 100644 --- a/src/tag.in +++ b/src/tag.in @@ -194,6 +194,7 @@ p 
param path pattern +picture plaintext polygon polyline @@ -210,10 +211,12 @@ ruby s samp script +search section select semantics set +slot small source spacer diff --git a/src/tag_enum.h b/src/tag_enum.h index 14cb5be..f47ce61 100644 --- a/src/tag_enum.h +++ b/src/tag_enum.h @@ -197,6 +197,7 @@ GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PATH, GUMBO_TAG_PATTERN, +GUMBO_TAG_PICTURE, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_POLYGON, GUMBO_TAG_POLYLINE, @@ -213,10 +214,12 @@ GUMBO_TAG_RUBY, GUMBO_TAG_S, GUMBO_TAG_SAMP, GUMBO_TAG_SCRIPT, +GUMBO_TAG_SEARCH, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT, GUMBO_TAG_SEMANTICS, GUMBO_TAG_SET, +GUMBO_TAG_SLOT, GUMBO_TAG_SMALL, GUMBO_TAG_SOURCE, GUMBO_TAG_SPACER, diff --git a/src/tag_perf.h b/src/tag_perf.h index 5988bd1..cb6ecbb 100644 --- a/src/tag_perf.h +++ b/src/tag_perf.h @@ -6,32 +6,32 @@ tag_hash (register const char *str, register unsigned int len) { static unsigned short asso_values[] = { - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 22, - 17, 12, 11, 9, 5, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 15, 182, 165, 120, 17, - 25, 160, 261, 124, 7, 214, 142, 13, 61, 54, - 58, 6, 4, 48, 9, 204, 14, 49, 199, 136, - 705, 705, 705, 705, 705, 705, 705, 15, 182, 165, - 120, 17, 25, 160, 261, 124, 7, 214, 142, 13, - 61, 54, 58, 6, 4, 48, 9, 204, 14, 49, - 199, 136, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 
705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705, 705, 705, 705, 705, - 705, 705, 705, 705, 705, 705 + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 18, + 17, 12, 11, 10, 3, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 7, 190, 210, 156, 13, + 53, 166, 303, 56, 3, 174, 114, 15, 67, 40, + 32, 2, 2, 25, 7, 220, 118, 12, 96, 75, + 692, 692, 692, 692, 692, 692, 692, 7, 190, 210, + 156, 13, 53, 166, 303, 56, 3, 174, 114, 15, + 67, 40, 32, 2, 2, 25, 7, 220, 118, 12, + 96, 75, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692, 692, 692, 692, 692, + 692, 692, 692, 692, 692, 692 }; register unsigned int hval = len; @@ -57,6 +57,7 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_Q , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -64,16 +65,9 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_LAST -, GUMBO_TAG_Q -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST , GUMBO_TAG_TR , GUMBO_TAG_LAST -, GUMBO_TAG_LAST +, 
GUMBO_TAG_A , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_RT @@ -83,485 +77,480 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_TT , GUMBO_TAG_LAST -, GUMBO_TAG_A -, GUMBO_TAG_LAST -, GUMBO_TAG_MTR -, GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MTR , GUMBO_TAG_LAST -, GUMBO_TAG_VAR , GUMBO_TAG_LAST , GUMBO_TAG_MARKER +, GUMBO_TAG_AREA , GUMBO_TAG_LAST -, GUMBO_TAG_MERROR -, GUMBO_TAG_EM , GUMBO_TAG_LAST +, GUMBO_TAG_ARTICLE , GUMBO_TAG_LAST -, GUMBO_TAG_METER +, GUMBO_TAG_MERROR , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_ARTICLE -, GUMBO_TAG_MTEXT +, GUMBO_TAG_METER , GUMBO_TAG_LAST -, GUMBO_TAG_AREA , GUMBO_TAG_MARQUEE -, GUMBO_TAG_LAST +, GUMBO_TAG_EM , GUMBO_TAG_META -, GUMBO_TAG_TREF +, GUMBO_TAG_MTEXT , GUMBO_TAG_MTABLE -, GUMBO_TAG_FRAMESET +, GUMBO_TAG_LAST , GUMBO_TAG_METADATA +, GUMBO_TAG_S , GUMBO_TAG_LAST +, GUMBO_TAG_STRIKE +, GUMBO_TAG_MSQRT +, GUMBO_TAG_SET , GUMBO_TAG_TEMPLATE , GUMBO_TAG_LAST -, GUMBO_TAG_FRAME -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MSROW , GUMBO_TAG_LAST +, GUMBO_TAG_PARAM , GUMBO_TAG_LAST +, GUMBO_TAG_PRE , GUMBO_TAG_LAST -, GUMBO_TAG_FETILE +, GUMBO_TAG_P +, GUMBO_TAG_MSTYLE +, GUMBO_TAG_MS +, GUMBO_TAG_RP +, GUMBO_TAG_MROOT , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_SPACER +, GUMBO_TAG_MROW , GUMBO_TAG_LAST -, GUMBO_TAG_FEFUNCR -, GUMBO_TAG_FEMERGE -, GUMBO_TAG_FETURBULENCE -, GUMBO_TAG_MSQRT , GUMBO_TAG_LAST -, GUMBO_TAG_FEMERGENODE -, GUMBO_TAG_STRIKE -, GUMBO_TAG_MROOT -, GUMBO_TAG_SET , GUMBO_TAG_LAST +, GUMBO_TAG_FRAMESET , GUMBO_TAG_LAST -, GUMBO_TAG_FEFUNCA -, GUMBO_TAG_MOVER +, GUMBO_TAG_TREF +, GUMBO_TAG_FRAME , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MSTYLE +, GUMBO_TAG_SAMP +, GUMBO_TAG_APPLET +, GUMBO_TAG_MPRESCRIPTS , GUMBO_TAG_LAST -, GUMBO_TAG_PARAM +, GUMBO_TAG_SEMANTICS +, GUMBO_TAG_TITLE +, GUMBO_TAG_MAP , GUMBO_TAG_LAST -, GUMBO_TAG_S +, GUMBO_TAG_MSPACE +, 
GUMBO_TAG_FETILE , GUMBO_TAG_LAST -, GUMBO_TAG_PRE -, GUMBO_TAG_FORM , GUMBO_TAG_LAST -, GUMBO_TAG_TFOOT +, GUMBO_TAG_TIME +, GUMBO_TAG_IMAGE +, GUMBO_TAG_MO +, GUMBO_TAG_FETURBULENCE , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_FOREIGNOBJECT +, GUMBO_TAG_FEMERGE , GUMBO_TAG_LAST -, GUMBO_TAG_NAV , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_FEMERGENODE +, GUMBO_TAG_ASIDE +, GUMBO_TAG_PROGRESS +, GUMBO_TAG_STOP , GUMBO_TAG_FESPOTLIGHT -, GUMBO_TAG_MS -, GUMBO_TAG_MENUITEM -, GUMBO_TAG_FEOFFSET , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_TFOOT +, GUMBO_TAG_I +, GUMBO_TAG_FORM +, GUMBO_TAG_FOREIGNOBJECT , GUMBO_TAG_MENCLOSE -, GUMBO_TAG_P +, GUMBO_TAG_FEPOINTLIGHT +, GUMBO_TAG_MENUITEM +, GUMBO_TAG_OPTGROUP +, GUMBO_TAG_PATTERN +, GUMBO_TAG_FEOFFSET , GUMBO_TAG_LAST -, GUMBO_TAG_MSROW , GUMBO_TAG_LAST -, GUMBO_TAG_FEPOINTLIGHT -, GUMBO_TAG_RP -, GUMBO_TAG_MO -, GUMBO_TAG_MROW , GUMBO_TAG_LAST +, GUMBO_TAG_STYLE , GUMBO_TAG_LAST +, GUMBO_TAG_TEXT +, GUMBO_TAG_FEFUNCR +, GUMBO_TAG_MI +, GUMBO_TAG_IFRAME +, GUMBO_TAG_TEXTAREA +, GUMBO_TAG_VAR +, GUMBO_TAG_FEFUNCA , GUMBO_TAG_LAST +, GUMBO_TAG_SPAN +, GUMBO_TAG_TSPAN +, GUMBO_TAG_FIELDSET , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_SPACER +, GUMBO_TAG_FOOTER +, GUMBO_TAG_FEIMAGE , GUMBO_TAG_LAST +, GUMBO_TAG_ANIMATECOLOR , GUMBO_TAG_LAST -, GUMBO_TAG_MPRESCRIPTS -, GUMBO_TAG_SEMANTICS , GUMBO_TAG_LAST +, GUMBO_TAG_NOSCRIPT +, GUMBO_TAG_MLABELEDTR +, GUMBO_TAG_MAIN +, GUMBO_TAG_ANIMATE , GUMBO_TAG_MN -, GUMBO_TAG_SAMP -, GUMBO_TAG_LAST -, GUMBO_TAG_DT +, GUMBO_TAG_OPTION , GUMBO_TAG_LAST -, GUMBO_TAG_MSPACE -, GUMBO_TAG_FOOTER , GUMBO_TAG_LAST +, GUMBO_TAG_ALTGLYPHITEM , GUMBO_TAG_LAST -, GUMBO_TAG_APPLET -, GUMBO_TAG_MAP , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_PATTERN , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_FONT +, GUMBO_TAG_ANIMATETRANSFORM , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_SELECT +, GUMBO_TAG_SMALL +, GUMBO_TAG_INPUT +, GUMBO_TAG_FEMORPHOLOGY +, 
GUMBO_TAG_PLAINTEXT , GUMBO_TAG_LAST +, GUMBO_TAG_FONT +, GUMBO_TAG_DT +, GUMBO_TAG_MSLINE , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_INS , GUMBO_TAG_LAST -, GUMBO_TAG_DATALIST -, GUMBO_TAG_RADIALGRADIENT +, GUMBO_TAG_XMP +, GUMBO_TAG_MALIGNGROUP +, GUMBO_TAG_MOVER , GUMBO_TAG_DATA -, GUMBO_TAG_TITLE -, GUMBO_TAG_FONT_FACE_FORMAT , GUMBO_TAG_FONT_FACE -, GUMBO_TAG_TIME -, GUMBO_TAG_LAST -, GUMBO_TAG_LAST +, GUMBO_TAG_FONT_FACE_FORMAT , GUMBO_TAG_LAST +, GUMBO_TAG_DATALIST +, GUMBO_TAG_RADIALGRADIENT , GUMBO_TAG_FONT_FACE_NAME -, GUMBO_TAG_PROGRESS -, GUMBO_TAG_STOP -, GUMBO_TAG_IMAGE -, GUMBO_TAG_LAST -, GUMBO_TAG_IFRAME -, GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_NOSCRIPT -, GUMBO_TAG_TSPAN -, GUMBO_TAG_MFENCED -, GUMBO_TAG_FIELDSET -, GUMBO_TAG_MLABELEDTR -, GUMBO_TAG_FEDISTANTLIGHT -, GUMBO_TAG_SPAN -, GUMBO_TAG_OPTGROUP -, GUMBO_TAG_OPTION +, GUMBO_TAG_SLOT +, GUMBO_TAG_NONE +, GUMBO_TAG_ALTGLYPHDEF +, GUMBO_TAG_NOFRAMES , GUMBO_TAG_LAST -, GUMBO_TAG_FEIMAGE -, GUMBO_TAG_ALTGLYPHITEM +, GUMBO_TAG_TRACK , GUMBO_TAG_BR , GUMBO_TAG_LAST -, GUMBO_TAG_FEFLOOD , GUMBO_TAG_LAST -, GUMBO_TAG_NOFRAMES -, GUMBO_TAG_NONE , GUMBO_TAG_LAST -, GUMBO_TAG_RECT , GUMBO_TAG_LAST -, GUMBO_TAG_DETAILS -, GUMBO_TAG_ALTGLYPHDEF -, GUMBO_TAG_FEMORPHOLOGY -, GUMBO_TAG_ACRONYM , GUMBO_TAG_LAST +, GUMBO_TAG_MARK +, GUMBO_TAG_VIEW , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_VIEW -, GUMBO_TAG_ASIDE +, GUMBO_TAG_STRONG +, GUMBO_TAG_POLYLINE +, GUMBO_TAG_DETAILS +, GUMBO_TAG_WBR +, GUMBO_TAG_ANIMATEMOTION , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MFRAC -, GUMBO_TAG_MPADDED -, GUMBO_TAG_DEFS -, GUMBO_TAG_STYLE -, GUMBO_TAG_ANIMATECOLOR -, GUMBO_TAG_MAIN , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_FEGAUSSIANBLUR , GUMBO_TAG_LAST -, GUMBO_TAG_SELECT -, GUMBO_TAG_SMALL -, GUMBO_TAG_ANIMATE , GUMBO_TAG_LAST -, GUMBO_TAG_MSLINE -, GUMBO_TAG_STRONG -, GUMBO_TAG_TABLE -, GUMBO_TAG_ANIMATETRANSFORM -, GUMBO_TAG_FECOMPONENTTRANSFER -, GUMBO_TAG_MLONGDIV -, 
GUMBO_TAG_SCRIPT -, GUMBO_TAG_PLAINTEXT -, GUMBO_TAG_FEFUNCG -, GUMBO_TAG_FECOMPOSITE +, GUMBO_TAG_MPADDED , GUMBO_TAG_LAST -, GUMBO_TAG_FEDISPLACEMENTMAP -, GUMBO_TAG_TEXT -, GUMBO_TAG_MALIGNGROUP +, GUMBO_TAG_DIR , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_WBR +, GUMBO_TAG_TABLE , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MASK , GUMBO_TAG_LAST +, GUMBO_TAG_MSTACK +, GUMBO_TAG_LI +, GUMBO_TAG_FONT_FACE_URI , GUMBO_TAG_LAST -, GUMBO_TAG_TRACK -, GUMBO_TAG_TEXTAREA -, GUMBO_TAG_I -, GUMBO_TAG_MARK -, GUMBO_TAG_TD +, GUMBO_TAG_FILTER , GUMBO_TAG_LAST -, GUMBO_TAG_CENTER , GUMBO_TAG_LAST -, GUMBO_TAG_DIR -, GUMBO_TAG_FEFUNCB -, GUMBO_TAG_INPUT -, GUMBO_TAG_OBJECT -, GUMBO_TAG_NOEMBED , GUMBO_TAG_LAST -, GUMBO_TAG_MACTION +, GUMBO_TAG_SYMBOL +, GUMBO_TAG_RECT , GUMBO_TAG_BASEFONT -, GUMBO_TAG_MI , GUMBO_TAG_LAST -, GUMBO_TAG_MTD , GUMBO_TAG_BASE +, GUMBO_TAG_ISINDEX +, GUMBO_TAG_ACRONYM , GUMBO_TAG_LAST +, GUMBO_TAG_FEDISTANTLIGHT +, GUMBO_TAG_MFENCED +, GUMBO_TAG_MSGROUP +, GUMBO_TAG_OBJECT , GUMBO_TAG_LAST +, GUMBO_TAG_FEGAUSSIANBLUR , GUMBO_TAG_LAST -, GUMBO_TAG_DFN -, GUMBO_TAG_HR -, GUMBO_TAG_FESPECTACTUALRLIGHTING -, GUMBO_TAG_H6 -, GUMBO_TAG_ANIMATEMOTION -, GUMBO_TAG_DIV +, GUMBO_TAG_SCRIPT +, GUMBO_TAG_DEFS , GUMBO_TAG_LAST -, GUMBO_TAG_FONT_FACE_URI , GUMBO_TAG_LAST -, GUMBO_TAG_POLYLINE +, GUMBO_TAG_LINE , GUMBO_TAG_LAST -, GUMBO_TAG_H5 -, GUMBO_TAG_OUTPUT -, GUMBO_TAG_MSCARRIES -, GUMBO_TAG_INS -, GUMBO_TAG_H4 -, GUMBO_TAG_MSGROUP -, GUMBO_TAG_H3 -, GUMBO_TAG_MUNDER -, GUMBO_TAG_USE -, GUMBO_TAG_MSTACK -, GUMBO_TAG_MMULTISCRIPTS -, GUMBO_TAG_MUNDEROVER -, GUMBO_TAG_ANNOTATION_XML -, GUMBO_TAG_MASK -, GUMBO_TAG_CANVAS , GUMBO_TAG_LAST -, GUMBO_TAG_H2 -, GUMBO_TAG_SECTION -, GUMBO_TAG_MENU , GUMBO_TAG_LAST -, GUMBO_TAG_FILTER -, GUMBO_TAG_MATH -, GUMBO_TAG_HEADER +, GUMBO_TAG_LINEARGRADIENT , GUMBO_TAG_LAST -, GUMBO_TAG_NOBR -, GUMBO_TAG_CAPTION -, GUMBO_TAG_H1 +, GUMBO_TAG_POLYGON +, GUMBO_TAG_ELLIPSE , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, 
GUMBO_TAG_ADDRESS -, GUMBO_TAG_VKERN , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_CIRCLE -, GUMBO_TAG_VIDEO -, GUMBO_TAG_FONT_FACE_SRC -, GUMBO_TAG_CITE , GUMBO_TAG_LAST -, GUMBO_TAG_G -, GUMBO_TAG_POLYGON +, GUMBO_TAG_ANNOTATION_XML +, GUMBO_TAG_OL +, GUMBO_TAG_FEDISPLACEMENTMAP , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_ELLIPSE +, GUMBO_TAG_USE , GUMBO_TAG_LAST -, GUMBO_TAG_MSUP , GUMBO_TAG_LAST -, GUMBO_TAG_SOURCE -, GUMBO_TAG_MSUBSUP -, GUMBO_TAG_XMP -, GUMBO_TAG_FIGURE , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_FESPECTACTUALRLIGHTING +, GUMBO_TAG_OUTPUT , GUMBO_TAG_LAST +, GUMBO_TAG_FEFLOOD +, GUMBO_TAG_NOEMBED +, GUMBO_TAG_MSCARRIES +, GUMBO_TAG_MFRAC , GUMBO_TAG_LAST -, GUMBO_TAG_EMBED +, GUMBO_TAG_CIRCLE +, GUMBO_TAG_MMULTISCRIPTS , GUMBO_TAG_LAST -, GUMBO_TAG_FEDIFFUSELIGHTING -, GUMBO_TAG_OL +, GUMBO_TAG_CITE , GUMBO_TAG_LAST +, GUMBO_TAG_FEFUNCG , GUMBO_TAG_LAST +, GUMBO_TAG_FIGURE +, GUMBO_TAG_MLONGDIV +, GUMBO_TAG_MSUP +, GUMBO_TAG_FECOMPONENTTRANSFER +, GUMBO_TAG_CENTER +, GUMBO_TAG_MSUBSUP +, GUMBO_TAG_FECOMPOSITE , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_SYMBOL -, GUMBO_TAG_RTC -, GUMBO_TAG_PATH -, GUMBO_TAG_LINE +, GUMBO_TAG_NOBR +, GUMBO_TAG_SOURCE , GUMBO_TAG_LAST -, GUMBO_TAG_LINEARGRADIENT -, GUMBO_TAG_FEBLEND -, GUMBO_TAG_MPATH -, GUMBO_TAG_MPHANTOM -, GUMBO_TAG_DESC +, GUMBO_TAG_MACTION , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_HR +, GUMBO_TAG_MUNDER +, GUMBO_TAG_H6 +, GUMBO_TAG_SUP +, GUMBO_TAG_NAV +, GUMBO_TAG_MUNDEROVER +, GUMBO_TAG_CANVAS +, GUMBO_TAG_FEFUNCB +, GUMBO_TAG_TBODY +, GUMBO_TAG_PICTURE +, GUMBO_TAG_MENU +, GUMBO_TAG_MALIGNMARK +, GUMBO_TAG_TD +, GUMBO_TAG_SECTION +, GUMBO_TAG_CAPTION +, GUMBO_TAG_LAST +, GUMBO_TAG_H5 , GUMBO_TAG_LAST +, GUMBO_TAG_H4 , GUMBO_TAG_LAST +, GUMBO_TAG_H3 , GUMBO_TAG_LAST -, GUMBO_TAG_CODE +, GUMBO_TAG_HEADER +, GUMBO_TAG_MSCARRY +, GUMBO_TAG_G , GUMBO_TAG_LAST -, GUMBO_TAG_DD +, GUMBO_TAG_KEYGEN +, GUMBO_TAG_MATH +, GUMBO_TAG_MTD +, 
GUMBO_TAG_NEXTID +, GUMBO_TAG_H2 , GUMBO_TAG_LAST +, GUMBO_TAG_H1 +, GUMBO_TAG_SUMMARY , GUMBO_TAG_LAST -, GUMBO_TAG_B , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_DFN , GUMBO_TAG_LAST -, GUMBO_TAG_MSCARRY -, GUMBO_TAG_RB -, GUMBO_TAG_SUP , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_ADDRESS +, GUMBO_TAG_FIGCAPTION +, GUMBO_TAG_PATH +, GUMBO_TAG_SEARCH , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_FIGCAPTION , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_CURSOR +, GUMBO_TAG_MPATH , GUMBO_TAG_LAST -, GUMBO_TAG_SVG -, GUMBO_TAG_TBODY -, GUMBO_TAG_ABBR , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_COLOR_PROFILE -, GUMBO_TAG_LI +, GUMBO_TAG_BLOCKQUOTE +, GUMBO_TAG_LISTING , GUMBO_TAG_LAST -, GUMBO_TAG_MALIGNMARK , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_AUDIO +, GUMBO_TAG_MPHANTOM , GUMBO_TAG_LAST +, GUMBO_TAG_VIDEO , GUMBO_TAG_LAST +, GUMBO_TAG_VKERN , GUMBO_TAG_LAST +, GUMBO_TAG_EMBED , GUMBO_TAG_LAST -, GUMBO_TAG_NEXTID +, GUMBO_TAG_B , GUMBO_TAG_LAST -, GUMBO_TAG_BLOCKQUOTE +, GUMBO_TAG_FONT_FACE_SRC +, GUMBO_TAG_RB +, GUMBO_TAG_FECOLORMATRIX , GUMBO_TAG_DL , GUMBO_TAG_LAST -, GUMBO_TAG_SUMMARY -, GUMBO_TAG_U -, GUMBO_TAG_LAST +, GUMBO_TAG_FECONVOLVEMATRIX , GUMBO_TAG_LAST -, GUMBO_TAG_THEAD -, GUMBO_TAG_BDO +, GUMBO_TAG_COLOR_PROFILE +, GUMBO_TAG_DIALOG , GUMBO_TAG_LAST +, GUMBO_TAG_ABBR , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_HEAD , GUMBO_TAG_LAST -, GUMBO_TAG_FECOLORMATRIX , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_FECONVOLVEMATRIX , GUMBO_TAG_LAST , GUMBO_TAG_DEL -, GUMBO_TAG_DIALOG +, GUMBO_TAG_LAST +, GUMBO_TAG_SWITCH , GUMBO_TAG_LAST , GUMBO_TAG_COLGROUP +, GUMBO_TAG_FEDIFFUSELIGHTING +, GUMBO_TAG_IMG , GUMBO_TAG_LAST -, GUMBO_TAG_HTML +, GUMBO_TAG_DESC , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MISSING_GLYPH , GUMBO_TAG_LAST -, GUMBO_TAG_KEYGEN -, GUMBO_TAG_ALTGLYPH , GUMBO_TAG_LAST +, GUMBO_TAG_LINK +, 
GUMBO_TAG_GLYPHREF , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_FEBLEND , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_CODE , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_LEGEND +, GUMBO_TAG_TEXTPATH +, GUMBO_TAG_AUDIO +, GUMBO_TAG_BDO +, GUMBO_TAG_LABEL , GUMBO_TAG_LAST +, GUMBO_TAG_RTC , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MSUB , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_ALTGLYPH +, GUMBO_TAG_CURSOR +, GUMBO_TAG_U , GUMBO_TAG_LAST +, GUMBO_TAG_HTML , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MISSING_GLYPH -, GUMBO_TAG_IMG , GUMBO_TAG_LAST -, GUMBO_TAG_BUTTON , GUMBO_TAG_LAST +, GUMBO_TAG_UL +, GUMBO_TAG_DIV , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MSUB +, GUMBO_TAG_LEGEND , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_GLYPHREF +, GUMBO_TAG_BDI , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_BODY , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_DD +, GUMBO_TAG_MULTICOL , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_LISTING , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_LABEL +, GUMBO_TAG_SVG , GUMBO_TAG_LAST -, GUMBO_TAG_SWITCH -, GUMBO_TAG_HGROUP -, GUMBO_TAG_UL , GUMBO_TAG_LAST +, GUMBO_TAG_COL , GUMBO_TAG_LAST +, GUMBO_TAG_HEAD +, GUMBO_TAG_THEAD , GUMBO_TAG_LAST -, GUMBO_TAG_TEXTPATH , GUMBO_TAG_LAST -, GUMBO_TAG_BODY , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_BUTTON +, GUMBO_TAG_RUBY , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_ISINDEX , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_COL , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MULTICOL , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -569,10 +558,10 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_BGSOUND , GUMBO_TAG_LAST , GUMBO_TAG_LAST , 
GUMBO_TAG_LAST +, GUMBO_TAG_HGROUP , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -582,10 +571,8 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_RUBY , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_TH , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -597,7 +584,6 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_LINK , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -605,12 +591,12 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_BDI +, GUMBO_TAG_BLINK , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_HKERN +, GUMBO_TAG_BGSOUND , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -628,13 +614,13 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_HKERN , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_MGLYPH , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -647,6 +633,7 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_BIG , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -669,9 +656,9 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_MGLYPH , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_SUB , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -680,8 +667,8 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_TH , GUMBO_TAG_LAST -, GUMBO_TAG_BIG , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -691,9 +678,9 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_KBD , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_SUB , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -719,7 +706,6 @@ static 
const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_BLINK , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -729,6 +715,7 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_GLYPH , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -744,6 +731,7 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST +, GUMBO_TAG_KBD , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST @@ -752,9 +740,8 @@ static const GumboTag kGumboTagMap[] = { , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_CLIPPATH , GUMBO_TAG_LAST , GUMBO_TAG_LAST , GUMBO_TAG_LAST -, GUMBO_TAG_GLYPH +, GUMBO_TAG_CLIPPATH }; diff --git a/src/tag_sizes.h b/src/tag_sizes.h index c9710ef..1862104 100644 --- a/src/tag_sizes.h +++ b/src/tag_sizes.h @@ -197,6 +197,7 @@ 5, 4, 7, +7, 9, 7, 8, @@ -213,10 +214,12 @@ 1, 4, 6, +6, 7, 6, 9, 3, +4, 5, 6, 6, diff --git a/src/tag_strings.h b/src/tag_strings.h index fece598..8c6af23 100644 --- a/src/tag_strings.h +++ b/src/tag_strings.h @@ -197,6 +197,7 @@ "param", "path", "pattern", +"picture", "plaintext", "polygon", "polyline", @@ -213,10 +214,12 @@ "s", "samp", "script", +"search", "section", "select", "semantics", "set", +"slot", "small", "source", "spacer", diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..e00a71b --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,32 @@ +# Build examples/clean_text to ensure c++ code can +# be linked to newly built gumbo shared library. 
+
+cmake_minimum_required(VERSION 3.5)  # the GTest::GTest / GTest::Main imported targets used below need CMake 3.5+
+
+set( test_executable run_tests )
+
+if( APPLE )
+    set(CMAKE_MACOSX_RPATH 1)
+endif()
+
+set(CXX_SOURCES attribute.cc
+                char_ref.cc
+                parser.cc
+                string_buffer.cc
+                string_piece.cc
+                test_utils.cc
+                tokenizer.cc
+                utf8.cc
+                vector.cc )
+
+# Unit-test runner; it has to be created before the target_* commands below can refer to it.
+add_executable(${test_executable} ${CXX_SOURCES} )
+# Ask for C++14 through target properties so the request is not tied to UNIX-style compiler flags.
+set_target_properties(${test_executable} PROPERTIES
+    CXX_STANDARD 14 CXX_STANDARD_REQUIRED ON CXX_EXTENSIONS OFF)
+
+# Keep the gumbo headers private to the test runner instead of attaching them to the
+# ${PROJECT_NAME} target; the GTest imported targets supply the GoogleTest include paths.
+target_include_directories(${test_executable} BEFORE PRIVATE ${GUMBO_INCLUDE_DIRS})
+target_link_libraries( ${test_executable} PRIVATE ${GUMBO_LIBRARIES} GTest::GTest GTest::Main)
+
diff --git a/tests/tokenizer.cc b/tests/tokenizer.cc
index a76fbed..40e6180 100644
--- a/tests/tokenizer.cc
+++ b/tests/tokenizer.cc
@@ -54,7 +54,7 @@ class GumboTokenizerTest : public GumboTest {
 };
 
 TEST(GumboTagEnumTest, TagEnumIncludesAllTags) {
-  EXPECT_EQ(256, GUMBO_TAG_UNKNOWN);
+  EXPECT_EQ(259, GUMBO_TAG_UNKNOWN);
   EXPECT_STREQ("", kGumboTagNames[GUMBO_TAG_UNKNOWN]);
 }