Skip to content

Commit

Permalink
Merge pull request #273 from nostrademons/cdata
Browse files Browse the repository at this point in the history
Handle CDATA properly
  • Loading branch information
nostrademons committed Feb 17, 2015
2 parents 12ce725 + ece6a44 commit 7a55fdc
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 16 deletions.
3 changes: 3 additions & 0 deletions DEBUGGING.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ $ gdb .libs/lt-gumbo_test core

The same goes for core dumps in other example binaries.

To run only a single unit test, pass the --gtest_filter='TestName' flag to the
lt-gumbo_test binary.

Assertions
==========

Expand Down
1 change: 1 addition & 0 deletions src/error.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ static void handle_parser_error(GumboParser* parser,
// But just in case...
print_message(parser, output, "Comments aren't legal here");
return;
case GUMBO_TOKEN_CDATA:
case GUMBO_TOKEN_WHITESPACE:
case GUMBO_TOKEN_CHARACTER:
print_message(parser, output, "Character tokens aren't legal here");
Expand Down
16 changes: 11 additions & 5 deletions src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ typedef struct _TextNodeBufferState {
// The source position of the start of this text node.
GumboSourcePosition _start_position;

// The type of node that will be inserted (TEXT or WHITESPACE).
// The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
GumboNodeType _type;
} TextNodeBufferState;

Expand Down Expand Up @@ -793,7 +793,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
}

assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
buffer_state->_type == GUMBO_NODE_TEXT);
buffer_state->_type == GUMBO_NODE_TEXT ||
buffer_state->_type == GUMBO_NODE_CDATA);
GumboNode* text_node = create_node(parser, buffer_state->_type);
GumboText* text_node_data = &text_node->v.text;
text_node_data->text = gumbo_string_buffer_to_string(
Expand Down Expand Up @@ -1019,7 +1020,9 @@ static GumboNode* insert_foreign_element(

static void insert_text_token(GumboParser* parser, GumboToken* token) {
assert(token->type == GUMBO_TOKEN_WHITESPACE ||
token->type == GUMBO_TOKEN_CHARACTER);
token->type == GUMBO_TOKEN_CHARACTER ||
token->type == GUMBO_TOKEN_NULL ||
token->type == GUMBO_TOKEN_CDATA);
TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
if (buffer_state->_buffer.length == 0) {
// Initialize position fields.
Expand All @@ -1030,6 +1033,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
parser, token->v.character, &buffer_state->_buffer);
if (token->type == GUMBO_TOKEN_CHARACTER) {
buffer_state->_type = GUMBO_NODE_TEXT;
} else if (token->type == GUMBO_TOKEN_CDATA) {
buffer_state->_type = GUMBO_NODE_CDATA;
}
gumbo_debug("Inserting text token '%c'.\n", token->v.character);
}
Expand Down Expand Up @@ -2207,7 +2212,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
reconstruct_active_formatting_elements(parser);
insert_text_token(parser, token);
return true;
} else if (token->type == GUMBO_TOKEN_CHARACTER) {
} else if (token->type == GUMBO_TOKEN_CHARACTER ||
token->type == GUMBO_TOKEN_CDATA) {
reconstruct_active_formatting_elements(parser);
insert_text_token(parser, token);
set_frameset_not_ok(parser);
Expand Down Expand Up @@ -3485,13 +3491,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
switch (token->type) {
case GUMBO_TOKEN_NULL:
parser_add_parse_error(parser, token);
token->type = GUMBO_TOKEN_CHARACTER;
token->v.character = kUtf8ReplacementChar;
insert_text_token(parser, token);
return false;
case GUMBO_TOKEN_WHITESPACE:
insert_text_token(parser, token);
return true;
case GUMBO_TOKEN_CDATA:
case GUMBO_TOKEN_CHARACTER:
insert_text_token(parser, token);
set_frameset_not_ok(parser);
Expand Down
1 change: 1 addition & 0 deletions src/token_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ typedef enum {
GUMBO_TOKEN_COMMENT,
GUMBO_TOKEN_WHITESPACE,
GUMBO_TOKEN_CHARACTER,
GUMBO_TOKEN_CDATA,
GUMBO_TOKEN_NULL,
GUMBO_TOKEN_EOF
} GumboTokenType;
Expand Down
18 changes: 15 additions & 3 deletions src/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ typedef struct GumboInternalTokenizerState {
// markup declaration state.
bool _is_current_node_foreign;

// A flag indicating whether the tokenizer is in a CDATA section. If so, then
// text tokens emitted will be GUMBO_TOKEN_CDATA.
bool _is_in_cdata;

// Certain states (notably character references) may emit two character tokens
// at once, but the contract for lex() fills in only one token at a time. The
// extra character is buffered here, and then this is checked on entry to
Expand Down Expand Up @@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
}

static GumboTokenType get_char_token_type(int c) {
static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
if (is_in_cdata && c > 0) {
return GUMBO_TOKEN_CDATA;
}

switch (c) {
case '\t':
case '\n':
Expand Down Expand Up @@ -475,7 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) {

// Writes a single specified character to the output token.
//
// The token type is chosen by get_char_token_type(), which receives the
// tokenizer's _is_in_cdata flag together with the character; the character
// itself is stored in output->v.character and the token is then completed via
// finish_token().
static void emit_char(GumboParser* parser, int c, GumboToken* output) {
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
  output->v.character = c;
  finish_token(parser, output);
}
Expand Down Expand Up @@ -850,6 +858,7 @@ void gumbo_tokenizer_state_init(
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
tokenizer->_reconsume_current_input = false;
tokenizer->_is_current_node_foreign = false;
tokenizer->_is_in_cdata = false;
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;

tokenizer->_buffered_emit_char = kGumboNoChar;
Expand Down Expand Up @@ -2041,6 +2050,7 @@ static StateResult handle_markup_declaration_state(
utf8iterator_maybe_consume_match(
&tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
tokenizer->_is_in_cdata = true;
tokenizer->_reconsume_current_input = true;
} else {
tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
Expand Down Expand Up @@ -2814,6 +2824,7 @@ static StateResult handle_cdata_state(
tokenizer->_reconsume_current_input = true;
reset_token_start_point(tokenizer);
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
tokenizer->_is_in_cdata = true;
return NEXT_CHAR;
} else {
return emit_current_char(parser, output);
Expand Down Expand Up @@ -2930,7 +2941,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
assert(!tokenizer->_temporary_buffer_emit);
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
int c = utf8iterator_current(&tokenizer->_input);
gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
gumbo_debug("Lexing character '%c' (%d) in state %d.\n",
c, c, tokenizer->_state);
StateResult result =
dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
// We need to clear reconsume_current_input before returning to prevent
Expand Down
17 changes: 9 additions & 8 deletions src/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ static void read_char(Utf8Iterator* iter) {
decode(&state, &code_point, (uint32_t) (unsigned char) (*c));
if (state == UTF8_ACCEPT) {
iter->_width = c - iter->_start + 1;
// This is the special handling for carriage returns that is mandated by the
// HTML5 spec. Since we're looking for particular 7-bit literal characters,
// we operate in terms of chars and only need a check for iter overrun,
// instead of having to read in a full next code point.
// This is the special handling for carriage returns that is mandated by
// the HTML5 spec. Since we're looking for particular 7-bit literal
// characters, we operate in terms of chars and only need a check for iter
// overrun, instead of having to read in a full next code point.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
if (code_point == '\r') {
assert(iter->_width == 1);
Expand Down Expand Up @@ -165,10 +165,11 @@ static void read_char(Utf8Iterator* iter) {
return;
}
}
// If we got here without exiting early, then we've reached the end of the iterator.
// Add an error for truncated input, set the width to consume the rest of the
// iterator, and emit a replacement character. The next time we enter this method,
// it will detect that there's no input to consume and
// If we got here without exiting early, then we've reached the end of the
// iterator. Add an error for truncated input, set the width to consume the
// rest of the iterator, and emit a replacement character. The next time we
// enter this method, it will detect that there's no input to consume and
// output an EOF.
iter->_current = kUtf8ReplacementChar;
iter->_width = iter->_end - iter->_start;
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
Expand Down
51 changes: 51 additions & 0 deletions tests/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1522,6 +1522,57 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) {
ASSERT_EQ(1, GetChildCount(li2));
}

// A CDATA section nested in an <svg> element is expected to yield exactly one
// child of type GUMBO_NODE_CDATA whose text is the section's contents.
TEST_F(GumboParserTest, CData) {
  Parse("<svg><![CDATA[this is text]]></svg>");

  // <body> should contain only the <svg> element.
  GumboNode* body_node;
  GetAndAssertBody(root_, &body_node);
  ASSERT_EQ(1, GetChildCount(body_node));

  // <svg> should contain only the CDATA node.
  GumboNode* svg_node = GetChild(body_node, 0);
  ASSERT_EQ(1, GetChildCount(svg_node));

  GumboNode* cdata_node = GetChild(svg_node, 0);
  ASSERT_EQ(GUMBO_NODE_CDATA, cdata_node->type);
  EXPECT_STREQ("this is text", cdata_node->v.text.text);
}

// NUL bytes embedded in a CDATA section are expected to show up as Unicode
// replacement characters in the resulting CDATA node's text.
TEST_F(GumboParserTest, CDataUnsafe) {
  // Parse() can't be used here because of its strlen; hand the parser the
  // buffer together with an explicit length instead.
  static const char kInput[] = "<svg><![CDATA[\0filler\0text\0]]>";
  output_ = gumbo_parse_with_options(&options_, kInput, sizeof(kInput) - 1);
  root_ = output_->document;

  GumboNode* body_node;
  GetAndAssertBody(root_, &body_node);
  ASSERT_EQ(1, GetChildCount(body_node));

  GumboNode* svg_node = GetChild(body_node, 0);
  ASSERT_EQ(1, GetChildCount(svg_node));

  GumboNode* cdata_node = GetChild(svg_node, 0);
  ASSERT_EQ(GUMBO_NODE_CDATA, cdata_node->type);
  // \xEF\xBF\xBD = unicode replacement char
  EXPECT_STREQ(
      "\xEF\xBF\xBD" "filler\xEF\xBF\xBD" "text\xEF\xBF\xBD",
      cdata_node->v.text.text);
}

// The same CDATA markup placed inside a <div> is expected to come back as a
// comment node whose text is "[CDATA[this is text]]", not as a CDATA node.
TEST_F(GumboParserTest, CDataInBody) {
  Parse("<div><![CDATA[this is text]]></div>");

  GumboNode* body_node;
  GetAndAssertBody(root_, &body_node);
  ASSERT_EQ(1, GetChildCount(body_node));

  GumboNode* div_node = GetChild(body_node, 0);
  ASSERT_EQ(1, GetChildCount(div_node));

  // The only child of the <div> is the comment produced from the markup.
  GumboNode* comment_node = GetChild(div_node, 0);
  ASSERT_EQ(GUMBO_NODE_COMMENT, comment_node->type);
  EXPECT_STREQ("[CDATA[this is text]]", comment_node->v.text.text);
}

TEST_F(GumboParserTest, FormattingTagsInHeading) {
Parse("<h2>This is <b>old</h2>text");

Expand Down
18 changes: 18 additions & 0 deletions tests/tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,24 @@ TEST_F(GumboTokenizerTest, ScriptDoubleEscaped) {
EXPECT_EQ('>', token_.v.character);
}

// Lexes a CDATA section that starts with a NUL byte, with the current node
// flagged as foreign. The NUL is expected to be reported as GUMBO_TOKEN_NULL,
// and the following 'f' as GUMBO_TOKEN_CDATA.
TEST_F(GumboTokenizerTest, CData) {
  // SetInput uses strlen and so can't handle nulls; re-initialise the
  // tokenizer state by hand with an explicit length.
  text_ = "<![CDATA[\0filler\0text\0]]>";
  gumbo_tokenizer_state_destroy(&parser_);
  gumbo_tokenizer_state_init(
      &parser_,
      text_,
      sizeof("<![CDATA[\0filler\0text\0]]>") - 1);
  gumbo_tokenizer_set_is_current_node_foreign(&parser_, true);

  // First token: the NUL byte right after "<![CDATA[".
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_NULL, token_.type);
  EXPECT_EQ(0, token_.v.character);

  // Second token: the first ordinary character of the section.
  gumbo_token_destroy(&parser_, &token_);
  EXPECT_TRUE(gumbo_lex(&parser_, &token_));
  EXPECT_EQ(GUMBO_TOKEN_CDATA, token_.type);
  EXPECT_EQ('f', token_.v.character);
}

TEST_F(GumboTokenizerTest, StyleHasTagEmbedded) {
SetInput("<style>/* For <head> */</style>");
Advance(1);
Expand Down
15 changes: 15 additions & 0 deletions tests/utf8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,21 @@ TEST_F(Utf8Test, MatchesCaseInsensitive) {
EXPECT_EQ(-1, utf8iterator_current(&input_));
}

// After utf8iterator_maybe_consume_match() consumes "cdata" from the input
// "CDATA\0f", the iterator is expected to sit on the NUL byte, and advancing
// once more is expected to land on 'f'.
TEST_F(Utf8Test, MatchFollowedByNullByte) {
  // Can't use ResetText, as the implicit strlen will choke on the null.
  text_ = "CDATA\0f";
  utf8iterator_init(&parser_, text_, 7, &input_);

  EXPECT_TRUE(
      utf8iterator_maybe_consume_match(
          &input_, "cdata", sizeof("cdata") - 1, false));

  // Positioned on the embedded NUL.
  EXPECT_EQ(0, utf8iterator_current(&input_));
  EXPECT_EQ('\0', *utf8iterator_get_char_pointer(&input_));

  // One step further: the character following the NUL.
  utf8iterator_next(&input_);
  EXPECT_EQ('f', utf8iterator_current(&input_));
  EXPECT_EQ('f', *utf8iterator_get_char_pointer(&input_));
}

TEST_F(Utf8Test, MarkReset) {
ResetText("this is a test");
Advance(5);
Expand Down

0 comments on commit 7a55fdc

Please sign in to comment.