Skip to content

Commit

Permalink
Encode html entities for attribute updates (#406)
Browse files Browse the repository at this point in the history
  • Loading branch information
s0ph1e authored May 3, 2020
1 parent 9c9985b commit c565e14
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 4 deletions.
4 changes: 3 additions & 1 deletion lib/resource-handler/html/html-source-element.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ class HtmlSourceElement {
* @param {string} newData
*/
setData (newData) {
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
// todo: encode can be removed after https://github.com/cheeriojs/cheerio/issues/957 fixed
const escapedData = utils.encodeHtmlEntities(newData);
this.rule.attr ? this.el.attr(this.rule.attr, escapedData) : this.el.text(newData);
}

removeIntegrityCheck () {
Expand Down
5 changes: 5 additions & 0 deletions lib/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ function decodeHtmlEntities (text) {
return typeof text === 'string' ? htmlEntities.decode(text) : '';
}

/**
 * Escape a string by delegating to `htmlEntities.escape`.
 * @param {*} text - value to escape
 * @returns {string} the escaped text, or an empty string when `text` is not a string
 */
function encodeHtmlEntities (text) {
	if (typeof text !== 'string') {
		return '';
	}
	return htmlEntities.escape(text);
}

/**
 * Make a shallow copy of an object's own enumerable properties.
 * @param {Object} obj - source object (null/undefined yield an empty object)
 * @returns {Object} a new plain object; nested values are shared with `obj`
 */
function clone (obj) {
	const copy = {};
	Object.assign(copy, obj);
	return copy;
}
Expand Down Expand Up @@ -188,6 +192,7 @@ module.exports = {
getTypeByMime,
getTypeByFilename,
decodeHtmlEntities,
encodeHtmlEntities,
clone,
extend,
union,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ var scrape = require('../../../index');
var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';

describe('Functional: html entities in url', function() {
describe('Functional: html entities', function() {

beforeEach(function() {
nock.cleanAll();
Expand All @@ -27,7 +27,9 @@ describe('Functional: html entities in url', function() {
// /fonts?family=Myriad&amp;v=2 => /fonts?family=Myriad&v=2
nock('http://example.com/').get('/fonts?family=Myriad&v=2').reply(200, 'fonts.css', {'content-type': 'text/css'});
// /?a=1&amp;style-attr.png => /?a=1&style-attr.png
nock('http://example.com/').get('/style-attr.png?a=1&style-attr.png').reply(200, 'style-attr.png', {'content-type': 'text/css'});
nock('http://example.com/').get('/style-attr.png?a=1&style-attr.png').reply(200, 'style-attr.png');
// &quot;style-attr2.png&quot; => style-attr2.png
nock('http://example.com/').get('/style-attr2.png').reply(200, 'style-attr2.png');
// /?a=1&amp;b=2 => /?a=1&b=2
nock('http://example.com/').get('/img.png?a=1&b=2').reply(200, 'img.png');
// /test?b=2&amp;c=3&amp;d=4 => /test?b=2&c=3&d=4
Expand Down Expand Up @@ -56,10 +58,16 @@ describe('Functional: html entities in url', function() {
fs.existsSync(testDirname + '/local/fonts.css').should.be.eql(true);
should(fs.readFileSync(testDirname + '/local/fonts.css').toString()).be.eql('fonts.css');

should(indexHtml).containEql('background: url(\'local/style-attr.png\')');
// single quote (') replaced with &#x27; in attribute
should(indexHtml).containEql('background: url(&#x27;local/style-attr.png&#x27;)');
fs.existsSync(testDirname + '/local/style-attr.png').should.be.eql(true);
should(fs.readFileSync(testDirname + '/local/style-attr.png').toString()).be.eql('style-attr.png');

// double quote (") replaced with &quot; in attribute
should(indexHtml).containEql('background: url(&quot;local/style-attr2.png&quot;)');
fs.existsSync(testDirname + '/local/style-attr2.png').should.be.eql(true);
should(fs.readFileSync(testDirname + '/local/style-attr2.png').toString()).be.eql('style-attr2.png');

should(indexHtml).containEql('img src="local/img.png');
fs.existsSync(testDirname + '/local/img.png').should.be.eql(true);
should(fs.readFileSync(testDirname + '/local/img.png').toString()).be.eql('img.png');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
</head>
<body>
<div style="background: url('http://example.com/style-attr.png?a=1&amp;style-attr.png')"></div>
<div style="background: url(&quot;http://example.com/style-attr2.png&quot;)"></div>
<img src="http://example.com/img.png?a=1&amp;b=2" />
<a href="?b=2&amp;c=3&amp;d=4">test</a>
</body>
Expand Down
23 changes: 23 additions & 0 deletions test/unit/resource-handler/html.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -267,4 +267,27 @@ describe('ResourceHandler: Html', () => {
resource.getText().should.not.containEql('integrity="sha256-X+Q/xqnlEgxCczSjjpp2AUGGgqM5gcBzhRQ0p+EAUEk="');
});
});

it('should use html entities for updated attributes', () => {
	// Value the stubbed downloadChildrenPaths resolves with on its first call (raw double quotes).
	const rewrittenStyle = 'width: 300px; height: 300px; background-image:url("./images/cat.jpg")';
	// Substring the handled document is expected to contain: same style, quotes as &quot;.
	const expectedAttr = 'style="width: 300px; height: 300px; background-image:url(&quot;./images/cat.jpg&quot;)"';

	const styleRules = [
		{ selector: '[style]', attr: 'style' },
	];
	htmlHandler = new HtmlHandler({sources: styleRules}, {downloadChildrenPaths});
	downloadChildrenPaths.onFirstCall().resolves(rewrittenStyle);

	const pageMarkup = `
<html>
<body>
<div style="width: 300px; height: 300px; background-image:url(&quot;http://example.com/cat.jpg&quot;)"></div>
</body>
</html>
`;

	const page = new Resource('http://example.com', 'index.html');
	page.setText(pageMarkup);

	// Returning the promise lets the test runner wait for the handler to finish.
	return htmlHandler.handle(page).then(() => {
		const handledText = page.getText();
		handledText.should.containEql(expectedAttr);
	});
});
});

0 comments on commit c565e14

Please sign in to comment.