Skip to content

Commit

Permalink
indexing: add 'noSurt' option for urlkey, simplify request-body appen…
Browse files Browse the repository at this point in the history
…d logic in indexer
  • Loading branch information
ikreymer committed Feb 27, 2021
1 parent 92c5b55 commit 57ea3b0
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
31 changes: 25 additions & 6 deletions dist/warcio.js
Original file line number Diff line number Diff line change
Expand Up @@ -8464,15 +8464,25 @@ function postToGetUrl(request) {
}

if (query) {
const start = (url.indexOf("?") > 0 ? "&" : "?");
request.url += `${start}__wb_method=${method}&${query}`;
request.url = appendRequestQuery(request.url, query, request.method);
request.method = "GET";
request.requestBody = query;
return true;
}

return false;
}

/**
 * Append the request method and a query string to a URL.
 *
 * The result has the form `<url><sep>__wb_method=<method>&<query>`, where
 * `<sep>` is "&" when the URL already contains a "?" past its first
 * character and "?" otherwise.
 *
 * @param {string} url - URL to extend.
 * @param {string} query - Query-string text placed after the method marker.
 * @param {string} method - Request method; when falsy the URL is returned unchanged.
 * @returns {string} The extended URL, or `url` itself when no method is given.
 */
function appendRequestQuery(url, query, method) {
  if (method) {
    // A "?" at index 0 is deliberately not counted as an existing query string.
    const alreadyHasQuery = url.indexOf("?") > 0;
    const separator = alreadyHasQuery ? "&" : "?";

    return `${url}${separator}__wb_method=${method}&${query}`;
  }

  return url;
}

function jsonToQueryString(json) {
if (typeof(json) === "string") {
try {
Expand Down Expand Up @@ -8656,6 +8666,7 @@ class CDXIndexer extends Indexer
this.includeAll = opts.all;
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this.noSurt = !!opts.noSurt;
this._lastRecord = null;

switch (opts.format) {
Expand Down Expand Up @@ -8739,10 +8750,11 @@ class CDXIndexer extends Indexer
indexRecordPair(record, reqRecord, parser, filename) {
let method;
let requestBody;
let url = record.warcTargetURI;

if (reqRecord && reqRecord.httpHeaders.method !== "GET") {
const request = {
url: record.warcTargetURI,
url,
method: reqRecord.httpHeaders.method,
headers: reqRecord.httpHeaders.headers,
postData: reqRecord.payload,
Expand All @@ -8751,10 +8763,15 @@ class CDXIndexer extends Indexer
method = request.method;

if (postToGetUrl(request)) {
requestBody = request.url.slice(record.warcTargetURI.length);
requestBody = request.requestBody;
record.method = method;
record.requestBody = requestBody;
url = request.url;
}
}

record._urlkey = url;

const res = super.indexRecord(record, parser, filename);
if (res && record && record._offset !== undefined) {
res.offset = record._offset;
Expand All @@ -8766,6 +8783,7 @@ class CDXIndexer extends Indexer
if (requestBody) {
res.requestBody = requestBody;
}

return res;
}

Expand All @@ -8792,7 +8810,8 @@ class CDXIndexer extends Indexer

switch (field) {
case "urlkey":
return getSurt(record.updatedURL ? record.updatedURL : record.warcTargetURI);
value = record._urlkey ? record._urlkey : record.warcTargetURI;
return this.noSurt ? value : getSurt(value);

case "timestamp":
value = record.warcDate;
Expand Down Expand Up @@ -8828,4 +8847,4 @@ class CDXIndexer extends Indexer
}
}

export { AsyncIterReader, BaseAsyncIterReader, CDXIndexer, Indexer, LimitReader, StatusAndHeaders, StatusAndHeadersParser, WARCParser, WARCRecord, WARCSerializer, getSurt, postToGetUrl };
export { AsyncIterReader, BaseAsyncIterReader, CDXIndexer, Indexer, LimitReader, StatusAndHeaders, StatusAndHeadersParser, WARCParser, WARCRecord, WARCSerializer, appendRequestQuery, getSurt, postToGetUrl };
4 changes: 2 additions & 2 deletions dist/warcio.min.js

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/cli_main.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ function main(args, out) {
'describe': 'output format',
'choices': ['json', 'cdxj', 'cdx'],
'default': 'cdxj'
}).
option('noSurt', {
'describe': 'Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)',
'type': 'boolean',
})
}, async (args) => {
/* istanbul ignore next */
Expand Down
13 changes: 10 additions & 3 deletions src/indexer.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { WARCParser } from './warcparser';

const DEFAULT_FIELDS = 'offset,warc-type,warc-target-uri'.split(',');

import { postToGetUrl, getSurt, appendRequestQuery } from './utils';
import { postToGetUrl, getSurt } from './utils';


// ===========================================================================
Expand Down Expand Up @@ -138,6 +138,7 @@ class CDXIndexer extends Indexer
this.includeAll = opts.all;
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this.noSurt = !!opts.noSurt;
this._lastRecord = null;

switch (opts.format) {
Expand Down Expand Up @@ -226,10 +227,11 @@ class CDXIndexer extends Indexer
indexRecordPair(record, reqRecord, parser, filename) {
let method;
let requestBody;
let url = record.warcTargetURI;

if (reqRecord && reqRecord.httpHeaders.method !== "GET") {
const request = {
url: record.warcTargetURI,
url,
method: reqRecord.httpHeaders.method,
headers: reqRecord.httpHeaders.headers,
postData: reqRecord.payload,
Expand All @@ -241,9 +243,12 @@ class CDXIndexer extends Indexer
requestBody = request.requestBody;
record.method = method;
record.requestBody = requestBody;
url = request.url;
}
}

record._urlkey = url;

const res = super.indexRecord(record, parser, filename);
if (res && record && record._offset !== undefined) {
res.offset = record._offset;
Expand All @@ -255,6 +260,7 @@ class CDXIndexer extends Indexer
if (requestBody) {
res.requestBody = requestBody;
}

return res;
}

Expand All @@ -281,7 +287,8 @@ class CDXIndexer extends Indexer

switch (field) {
case "urlkey":
return getSurt(appendRequestQuery(record.warcTargetURI, record.requestBody, record.method));
value = record._urlkey ? record._urlkey : record.warcTargetURI;
return this.noSurt ? value : getSurt(value);

case "timestamp":
value = record.warcDate;
Expand Down

0 comments on commit 57ea3b0

Please sign in to comment.