Skip to content

Commit

Permalink
Load page subresources from Wayback Machine
Browse files Browse the repository at this point in the history
Add a setting (on by default) to transform subresource URLs (styles, scripts, images, etc.) such that they load from the Internet Archive’s Wayback Machine instead of from their original URL, which might be either missing or different from what it was at the time you are looking at a snapshot of. This also handily works around some (but not all) of the URL security issues in edgi-govdata-archiving/web-monitoring#92.
  • Loading branch information
Mr0grog committed Sep 23, 2018
1 parent 78beb63 commit 2ad85e5
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 26 deletions.
1 change: 1 addition & 0 deletions src/components/change-view.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ export default class ChangeView extends React.Component {
collapsedView: true,
diffSettings: {
removeFormatting: false,
useWaybackResources: true,
},
diffType: undefined,
updating: false,
Expand Down
46 changes: 32 additions & 14 deletions src/components/diff-settings-form.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import React from 'react';
// Diff types that we can remove formatting from
const typesWithFormatting = ['SIDE_BY_SIDE_RENDERED', 'HIGHLIGHTED_RENDERED'];

// Build a fresh object holding the own enumerable properties of every
// argument, copied left to right so later objects override earlier ones.
// The copy is shallow: nested values are shared, not cloned.
const mergeObjects = (...sources) => {
  const merged = {};
  for (const source of sources) {
    Object.assign(merged, source);
  }
  return merged;
};

/**
* @typedef DiffSettingsFormProps
* @property {string} diffType The current diff type to render controls for
Expand All @@ -20,7 +23,7 @@ export default class DiffSettingsForm extends React.PureComponent {
constructor (props) {
super(props);

this._handleRemoveFormattingChange = this._handleRemoveFormattingChange.bind(this);
this._handleCheckboxChange = this._handleCheckboxChange.bind(this);
}

render () {
Expand All @@ -29,21 +32,36 @@ export default class DiffSettingsForm extends React.PureComponent {
}

return (
<label className="utilities__label">
<input
checked={this.props.settings.removeFormatting}
className="utilities__input"
onChange={this._handleRemoveFormattingChange}
type="checkbox">
</input>
Remove formatting
</label>
<form>
<label className="utilities__label">
<input
checked={this.props.settings.removeFormatting}
className="utilities__input"
name="removeFormatting"
onChange={this._handleCheckboxChange}
type="checkbox">
</input>
Remove formatting
</label>

<label className="utilities__label">
<input
checked={this.props.settings.useWaybackResources}
className="utilities__input"
name="useWaybackResources"
onChange={this._handleCheckboxChange}
type="checkbox">
</input>
Load Resources from Wayback Machine
</label>
</form>
);
}

_handleRemoveFormattingChange (event) {
this.props.onChange({
removeFormatting: event.target.checked
});
_handleCheckboxChange (event) {
const field = event.target.name;
this.props.onChange(mergeObjects(this.props.settings, {
[field]: event.target.checked
}));
}
}
20 changes: 14 additions & 6 deletions src/components/diff-view.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,19 @@ export default class DiffView extends React.Component {
}

renderDiff () {
const commonProps = {
page: this.props.page,
a: this.props.a,
b: this.props.b,
diffData: this.state.diffData
};
// TODO: if we have multiple ways to render content from a single service
// in the future (e.g. inline vs. side-by-side text), we need a better
// way to ensure we use the correct rendering and avoid race conditions
switch (this.props.diffType) {
case diffTypes.RAW_SIDE_BY_SIDE.value:
return (
<SideBySideRawVersions page={this.props.page} a={this.props.a} b={this.props.b} diffData={this.state.diffData} />
<SideBySideRawVersions {...commonProps} />
);
case diffTypes.RAW_FROM_CONTENT.value:
return (
Expand All @@ -135,17 +141,19 @@ export default class DiffView extends React.Component {
);
case diffTypes.HIGHLIGHTED_RENDERED.value:
return (
<InlineRenderedDiff diffData={this.state.diffData} page={this.props.page}
removeFormatting={this.props.diffSettings.removeFormatting}/>
<InlineRenderedDiff {...commonProps}
removeFormatting={this.props.diffSettings.removeFormatting}
useWaybackResources={this.props.diffSettings.useWaybackResources} />
);
case diffTypes.SIDE_BY_SIDE_RENDERED.value:
return (
<SideBySideRenderedDiff diffData={this.state.diffData} page={this.props.page}
removeFormatting={this.props.diffSettings.removeFormatting}/>
<SideBySideRenderedDiff {...commonProps}
removeFormatting={this.props.diffSettings.removeFormatting}
useWaybackResources={this.props.diffSettings.useWaybackResources} />
);
case diffTypes.OUTGOING_LINKS.value:
return (
<InlineRenderedDiff diffData={this.state.diffData} page={this.props.page} />
<InlineRenderedDiff {...commonProps} />
);
case diffTypes.HIGHLIGHTED_TEXT.value:
return (
Expand Down
11 changes: 9 additions & 2 deletions src/components/inline-rendered-diff.jsx
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import React from 'react';
import {removeStyleAndScript} from '../scripts/html-transforms';
import {removeStyleAndScript, loadSubresourcesFromWayback, compose} from '../scripts/html-transforms';
import SandboxedHtml from './sandboxed-html';

/**
* @typedef {Object} InlineRenderedDiffProps
* @property {DiffData} diffData Object containing diff to render and its metadata
* @property {Page} page The page this diff pertains to
* @property {boolean} removeFormatting
* @property {boolean} useWaybackResources
*/

/**
Expand All @@ -19,7 +20,13 @@ import SandboxedHtml from './sandboxed-html';
export default class InlineRenderedDiff extends React.Component {
render () {
const diff = this.props.diffData.combined || this.props.diffData.diff;
const transformDocument = this.props.removeFormatting && removeStyleAndScript;
const transformDocument = compose(
this.props.removeFormatting && removeStyleAndScript,
this.props.useWaybackResources && loadSubresourcesFromWayback(
this.props.page,
diff
)
);

return (
<div className="inline-render">
Expand Down
23 changes: 19 additions & 4 deletions src/components/side-by-side-rendered-diff.jsx
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import React from 'react';
import {removeStyleAndScript} from '../scripts/html-transforms';
import {removeStyleAndScript, loadSubresourcesFromWayback, compose} from '../scripts/html-transforms';
import SandboxedHtml from './sandboxed-html';

/**
* @typedef {Object} SideBySideRenderedDiffProps
* @property {DiffData} diffData Object containing diff to render and its metadata
* @property {Page} page The page this diff pertains to
* @property {Version} a The "A" version of the page this diff pertains to
* @property {Version} b The "B" version of the page this diff pertains to
* @property {boolean} removeFormatting
* @property {boolean} useWaybackResources
*/

/**
Expand All @@ -18,19 +21,31 @@ import SandboxedHtml from './sandboxed-html';
*/
export default class SideBySideRenderedDiff extends React.Component {
render () {
const transformDocument = this.props.removeFormatting && removeStyleAndScript;
const baseTransform = this.props.removeFormatting && removeStyleAndScript;
let transformA = baseTransform;
let transformB = baseTransform;
if (this.props.useWaybackResources) {
transformA = compose(transformA, loadSubresourcesFromWayback(
this.props.page,
this.props.a
));
transformB = compose(transformB, loadSubresourcesFromWayback(
this.props.page,
this.props.b
));
}

return (
<div className="side-by-side-render">
<SandboxedHtml
html={this.props.diffData.deletions}
baseUrl={this.props.page.url}
transform={transformDocument}
transform={transformA}
/>
<SandboxedHtml
html={this.props.diffData.insertions}
baseUrl={this.props.page.url}
transform={transformDocument}
transform={transformB}
/>
</div>
);
Expand Down
89 changes: 89 additions & 0 deletions src/scripts/html-transforms.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
/**
 * Combine several transforms into one that applies them in order, left to
 * right, feeding each transform's return value into the next.
 *
 * Falsy entries are ignored, so callers can write
 * `compose(condition && someTransform, ...)`. When no usable transforms are
 * given, the result is an identity function.
 *
 * @param {...(Function|false|null|undefined)} transforms Functions that take
 *        a value and return the (possibly modified) value.
 * @returns {Function} A function that runs every supplied transform on its
 *          input and returns the final result.
 */
export function compose (...transforms) {
  const activeTransforms = transforms.filter(Boolean);
  if (activeTransforms.length === 0) {
    return x => x;
  }

  // The reduction's result has to be returned: without it the composed
  // transform would always evaluate to `undefined`, unlike both the
  // identity case above and each individual transform.
  return input => activeTransforms.reduce(
    (output, transform) => transform(output),
    input
  );
}

export function removeStyleAndScript (document) {
// Stylesheets and scripts
document.querySelectorAll('link[rel="stylesheet"], style, script').forEach(node => {
Expand All @@ -14,3 +27,79 @@ export function removeStyleAndScript (document) {

return document;
}

/**
 * Create a document transform that rewrites stylesheet links so they load
 * from the Wayback Machine instead of from their original location.
 *
 * Only `<link rel="stylesheet">` elements are rewritten by this function.
 *
 * @param {WebMonitoringDb.Page} page Page the document belongs to. Its `url`
 *        is used as the base for resolving relative stylesheet URLs.
 * @param {WebMonitoringDb.Version} version Version being rendered. Its
 *        `capture_time` is converted to the Wayback timestamp used in the
 *        rewritten URLs.
 * @returns {Function} A transform that takes a document, modifies its
 *          stylesheet links in place, and returns the same document.
 */
export function loadSubresourcesFromWayback (page, version) {
  return document => {
    const timestamp = createWaybackTimestamp(version.capture_time);
    document.querySelectorAll('link[rel="stylesheet"]').forEach(node => {
      const href = node.getAttribute('href');
      // A link with a missing or empty href has nothing to load. Passing a
      // null href on would also throw during URL resolution and abort the
      // whole transform, so leave such nodes untouched.
      if (!href) {
        return;
      }

      node.href = createWaybackUrl(href, timestamp, page.url);
    });

    return document;
  };
}


// ---------------------- Support Functions -----------------------------

/**
 * Format a Date as a Wayback Machine-style timestamp string: the UTC year
 * followed by the two-digit UTC month, day, hour, minute, and second.
 * @param {Date} date The JS date object to format
 * @returns {String}
 */
function createWaybackTimestamp (date) {
  const year = date.getUTCFullYear();
  const paddedFields = [
    date.getUTCMonth() + 1, // getUTCMonth() is zero-based
    date.getUTCDate(),
    date.getUTCHours(),
    date.getUTCMinutes(),
    date.getUTCSeconds()
  ].map(value => twoDigit(value));

  return `${year}${paddedFields.join('')}`;
}

// Matches strings that start with one or more non-slash characters followed
// by `://`, i.e. URLs that carry an explicit scheme such as `https://`.
const PROTOCOL_PATTERN = /^[^/]+:\/\//;

/**
 * Build a Wayback Machine URL for the archived copy of a resource nearest
 * to a given time.
 * @param {String} originalUrl URL of the resource to get from the Wayback
 *        Machine. May be relative, in which case it is resolved against
 *        `baseUrl`.
 * @param {Date|String} timestamp Time of the desired capture, either as a
 *        Date or as an already-formatted Wayback timestamp string
 * @param {String} baseUrl URL to resolve `originalUrl` against
 * @returns {String}
 */
function createWaybackUrl (originalUrl, timestamp, baseUrl) {
  const waybackTime = typeof timestamp === 'string'
    ? timestamp
    : createWaybackTimestamp(timestamp);
  const absoluteUrl = resolveUrl(originalUrl, baseUrl);

  return `https://web.archive.org/web/${waybackTime}id_/${absoluteUrl}`;
}

/**
 * Resolve a possibly-relative URL into an absolute one.
 *
 * - Protocol-relative URLs (`//host/path`) are given the `https:` scheme.
 * - URLs that already have a `scheme://` prefix are returned unchanged.
 * - Everything else is resolved against `baseUrl` using the standard URL
 *   parser, so `../`, `./`, query-only (`?a=b`), and fragment-only
 *   references are handled correctly.
 *
 * @param {String} url The URL to resolve
 * @param {String} baseUrl The base URL to resolve from
 * @returns {String} The absolute URL
 * @throws {TypeError} If `url` is relative and `baseUrl` is not a valid
 *         absolute URL
 */
function resolveUrl (url, baseUrl) {
  // Protocol-relative URLs are always upgraded to HTTPS here rather than
  // inheriting the base URL's scheme.
  if (url.startsWith('//')) {
    return `https:${url}`;
  }

  // Already absolute: hand it back untouched.
  if (PROTOCOL_PATTERN.test(url)) {
    return url;
  }

  // Joining the base's directory and the relative path by hand leaves
  // `../` segments unresolved and mishandles query-only or fragment-only
  // references; the URL constructor implements full relative resolution.
  return new URL(url, baseUrl).href;
}

/**
 * Render a number as a string that is at least two characters long,
 * left-padding with a zero when needed.
 * @param {Number} number The number to format
 * @returns {String}
 */
function twoDigit (number) {
  const digits = number.toString();
  return digits.padStart(2, '0');
}

0 comments on commit 2ad85e5

Please sign in to comment.