From 2ad85e5b30586902828b3eacc9453ac27d87a443 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Sat, 22 Sep 2018 18:36:15 -0700 Subject: [PATCH] Load page subresources from Wayback Machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a setting (on by default) to transform subresource URLs (styles, scripts, images, etc.) such that they load from the Internet Archive’s Wayback Machine instead of from their original URL, which might be either missing or different from what it was at the time you are looking at a snapshot of. This also handily works around some (but not all) of the URL security issues in edgi-govdata-archiving/web-monitoring#92. --- src/components/change-view.jsx | 1 + src/components/diff-settings-form.jsx | 46 +++++++--- src/components/diff-view.jsx | 20 +++-- src/components/inline-rendered-diff.jsx | 11 ++- src/components/side-by-side-rendered-diff.jsx | 23 ++++- src/scripts/html-transforms.js | 89 +++++++++++++++++++ 6 files changed, 164 insertions(+), 26 deletions(-) diff --git a/src/components/change-view.jsx b/src/components/change-view.jsx index a36671d5..c1ca0133 100644 --- a/src/components/change-view.jsx +++ b/src/components/change-view.jsx @@ -66,6 +66,7 @@ export default class ChangeView extends React.Component { collapsedView: true, diffSettings: { removeFormatting: false, + useWaybackResources: true, }, diffType: undefined, updating: false, diff --git a/src/components/diff-settings-form.jsx b/src/components/diff-settings-form.jsx index 28cbc8c2..5b77efaa 100644 --- a/src/components/diff-settings-form.jsx +++ b/src/components/diff-settings-form.jsx @@ -3,6 +3,9 @@ import React from 'react'; // Diff types that we can remove formatting from const typesWithFormatting = ['SIDE_BY_SIDE_RENDERED', 'HIGHLIGHTED_RENDERED']; +// Shallow-merge multiple objects +const mergeObjects = (...objects) => Object.assign({}, ...objects); + /** * @typedef DiffSettingsFormProps * @property {string} diffType The 
current diff type to render controls for @@ -20,7 +23,7 @@ export default class DiffSettingsForm extends React.PureComponent { constructor (props) { super(props); - this._handleRemoveFormattingChange = this._handleRemoveFormattingChange.bind(this); + this._handleCheckboxChange = this._handleCheckboxChange.bind(this); } render () { @@ -29,21 +32,36 @@ export default class DiffSettingsForm extends React.PureComponent { } return ( - +
+ + + +
); } - _handleRemoveFormattingChange (event) { - this.props.onChange({ - removeFormatting: event.target.checked - }); + _handleCheckboxChange (event) { + const field = event.target.name; + this.props.onChange(mergeObjects(this.props.settings, { + [field]: event.target.checked + })); } } diff --git a/src/components/diff-view.jsx b/src/components/diff-view.jsx index 64176751..566a6e8b 100644 --- a/src/components/diff-view.jsx +++ b/src/components/diff-view.jsx @@ -117,13 +117,19 @@ export default class DiffView extends React.Component { } renderDiff () { + const commonProps = { + page: this.props.page, + a: this.props.a, + b: this.props.b, + diffData: this.state.diffData + }; // TODO: if we have multiple ways to render content from a single service // in the future (e.g. inline vs. side-by-side text), we need a better // way to ensure we use the correct rendering and avoid race conditions switch (this.props.diffType) { case diffTypes.RAW_SIDE_BY_SIDE.value: return ( - + ); case diffTypes.RAW_FROM_CONTENT.value: return ( @@ -135,17 +141,19 @@ export default class DiffView extends React.Component { ); case diffTypes.HIGHLIGHTED_RENDERED.value: return ( - + ); case diffTypes.SIDE_BY_SIDE_RENDERED.value: return ( - + ); case diffTypes.OUTGOING_LINKS.value: return ( - + ); case diffTypes.HIGHLIGHTED_TEXT.value: return ( diff --git a/src/components/inline-rendered-diff.jsx b/src/components/inline-rendered-diff.jsx index 11e0f093..a450cd6c 100644 --- a/src/components/inline-rendered-diff.jsx +++ b/src/components/inline-rendered-diff.jsx @@ -1,5 +1,5 @@ import React from 'react'; -import {removeStyleAndScript} from '../scripts/html-transforms'; +import {removeStyleAndScript, loadSubresourcesFromWayback, compose} from '../scripts/html-transforms'; import SandboxedHtml from './sandboxed-html'; /** @@ -7,6 +7,7 @@ import SandboxedHtml from './sandboxed-html'; * @property {DiffData} diffData Object containing diff to render and its metadata * @property {Page} page The page this 
diff pertains to * @property {boolean} removeFormatting + * @property {boolean} useWaybackResources */ /** @@ -19,7 +20,13 @@ import SandboxedHtml from './sandboxed-html'; export default class InlineRenderedDiff extends React.Component { render () { const diff = this.props.diffData.combined || this.props.diffData.diff; - const transformDocument = this.props.removeFormatting && removeStyleAndScript; + const transformDocument = compose( + this.props.removeFormatting && removeStyleAndScript, + this.props.useWaybackResources && loadSubresourcesFromWayback( + this.props.page, + diff + ) + ); return (
diff --git a/src/components/side-by-side-rendered-diff.jsx b/src/components/side-by-side-rendered-diff.jsx index 08186170..de687e80 100644 --- a/src/components/side-by-side-rendered-diff.jsx +++ b/src/components/side-by-side-rendered-diff.jsx @@ -1,12 +1,15 @@ import React from 'react'; -import {removeStyleAndScript} from '../scripts/html-transforms'; +import {removeStyleAndScript, loadSubresourcesFromWayback, compose} from '../scripts/html-transforms'; import SandboxedHtml from './sandboxed-html'; /** * @typedef {Object} SideBySideRenderedDiffProps * @property {DiffData} diffData Object containing diff to render and its metadata * @property {Page} page The page this diff pertains to + * @property {Version} a The "A" version of the page this diff pertains to + * @property {Version} b The "B" version of the page this diff pertains to * @property {boolean} removeFormatting + * @property {boolean} useWaybackResources */ /** @@ -18,19 +21,31 @@ import SandboxedHtml from './sandboxed-html'; */ export default class SideBySideRenderedDiff extends React.Component { render () { - const transformDocument = this.props.removeFormatting && removeStyleAndScript; + const baseTransform = this.props.removeFormatting && removeStyleAndScript; + let transformA = baseTransform; + let transformB = baseTransform; + if (this.props.useWaybackResources) { + transformA = compose(transformA, loadSubresourcesFromWayback( + this.props.page, + this.props.a + )); + transformB = compose(transformB, loadSubresourcesFromWayback( + this.props.page, + this.props.b + )); + } return (
);
diff --git a/src/scripts/html-transforms.js b/src/scripts/html-transforms.js
index b674e02a..0c3a6903 100644
--- a/src/scripts/html-transforms.js
+++ b/src/scripts/html-transforms.js
@@ -1,3 +1,26 @@
+/**
+ * Combine several document transforms into one that applies them in order,
+ * passing each transform's return value to the next. Falsy entries are
+ * dropped, so callers can pass `condition && transform`.
+ * @param {...(Function|false|null|undefined)} transforms Transforms to combine
+ * @returns {Function} Transform that returns the last transform's result, or
+ *          its input unchanged when no transforms were given
+ */
+export function compose (...transforms) {
+  const activeTransforms = transforms.filter(transform => !!transform);
+  if (activeTransforms.length === 0) {
+    return x => x;
+  }
+
+  return (input) => {
+    // The reduced value must be returned; otherwise the composed transform
+    // yields undefined while the no-op case above yields its input.
+    return activeTransforms.reduce((output, transform) => {
+      return transform(output);
+    }, input);
+  };
+}
+
 export function removeStyleAndScript (document) {
   // Stylesheets and scripts
   document.querySelectorAll('link[rel="stylesheet"], style, script').forEach(node => {
@@ -14,3 +37,103 @@ export function removeStyleAndScript (document) {
 
   return document;
 }
+
+/**
+ * Create a transform that rewrites the `href` of each stylesheet `<link>` in
+ * a document so it loads from the Wayback Machine, near the time the given
+ * version was captured.
+ * @param {WebMonitoringDb.Page} page Page whose `url` is used as the base
+ *        for resolving relative stylesheet URLs
+ * @param {WebMonitoringDb.Version} version Version whose `capture_time`
+ *        (which must be a Date) picks the archived copy to load
+ * @returns {Function} Transform that modifies a document and returns it
+ */
+export function loadSubresourcesFromWayback (page, version) {
+  return document => {
+    const timestamp = createWaybackTimestamp(version.capture_time);
+    document.querySelectorAll('link[rel="stylesheet"]').forEach(node => {
+      const href = node.getAttribute('href');
+      // A <link> with no href has nothing to rewrite, and handing null to
+      // createWaybackUrl would throw while resolving the URL.
+      if (href) {
+        node.href = createWaybackUrl(href, timestamp, page.url);
+      }
+    });
+
+    return document;
+  };
+}
+
+
+// ---------------------- Support Functions -----------------------------
+
+/**
+ * Convert a Date object to a Wayback Machine-style timestamp string
+ * (`YYYYMMDDhhmmss`, using the date's UTC fields).
+ * @param {Date} date A JS date object to convert
+ * @returns {String}
+ */
+function createWaybackTimestamp (date) {
+  return '' + date.getUTCFullYear()
+    + twoDigit(date.getUTCMonth() + 1)
+    + twoDigit(date.getUTCDate())
+    + twoDigit(date.getUTCHours())
+    + twoDigit(date.getUTCMinutes())
+    + twoDigit(date.getUTCSeconds());
+}
+
+// Matches a URL that begins with a scheme followed by `://`.
+const PROTOCOL_PATTERN = /^[^/]+:\/\//;
+
+/**
+ * Create a URL that points to a Wayback Machine-archived version of another
+ * URL near a particular date.
+ * @param {String} originalUrl URL of the resource to get from the Wayback Machine
+ * @param {Date|String} timestamp Date of the capture to load, as a Date or
+ *        as an already-formatted Wayback timestamp string
+ * @param {String} baseUrl URL to resolve `originalUrl` against if it is relative
+ * @returns {String}
+ */
+function createWaybackUrl (originalUrl, timestamp, baseUrl) {
+  if (typeof timestamp !== 'string') {
+    timestamp = createWaybackTimestamp(timestamp);
+  }
+
+  const url = resolveUrl(originalUrl, baseUrl);
+  return `https://web.archive.org/web/${timestamp}id_/${url}`;
+}
+
+/**
+ * Resolve a full URL from a relative one.
+ * @param {String} url The URL to resolve
+ * @param {String} baseUrl The base URL to resolve from
+ * @returns {String}
+ */
+function resolveUrl (url, baseUrl) {
+  // Protocol-relative URLs are always loaded over HTTPS.
+  if (url.startsWith('//')) {
+    return `https:${url}`;
+  }
+  else if (!PROTOCOL_PATTERN.test(url)) {
+    const base = new URL(baseUrl);
+    if (url.startsWith('/')) {
+      return `${base.origin}${url}`;
+    }
+    else {
+      // Relative to the directory of the base URL's path
+      const path = base.pathname.split('/').slice(0, -1).join('/');
+      return `${base.origin}${path}/${url}`;
+    }
+  }
+  return url;
+}
+
+/**
+ * Format a number as a string at least two characters long, padded with
+ * leading zeros.
+ * @param {Number} number
+ * @returns {String}
+ */
+function twoDigit (number) {
+  return number.toString().padStart(2, '0');
+}