Skip to content

Commit

Permalink
post request body processing:
Browse files Browse the repository at this point in the history
- group request-responses into pairs, process request body when indexing
- convert form-encoded, multipart, and json post requests into an extra string stored in requestBody
- expose getSurt, postToGet (moved from wabac.js)
- bump to 1.4.0
  • Loading branch information
ikreymer committed Feb 22, 2021
1 parent a823bd3 commit f865b19
Show file tree
Hide file tree
Showing 9 changed files with 463 additions and 58 deletions.
246 changes: 218 additions & 28 deletions dist/warcio.js
Original file line number Diff line number Diff line change
Expand Up @@ -8402,6 +8402,124 @@ class WARCSerializer extends BaseAsyncIterReader
}
}

/**
 * Build a SURT-style (host-reversed) lookup key for an http(s) URL.
 *
 * The key is `<reversed,host,labels>[:port])<path>[?<sorted query>]`, computed
 * from the lower-cased URL. A leading `www.` / `wwwN.` host label is dropped so
 * that `www.example.com` and `example.com` share one key.
 *
 * @param {string} url - URL to convert.
 * @returns {string} The SURT key. Non-http(s) input is returned unchanged, and
 *   the (possibly www-stripped) URL is returned if it cannot be parsed.
 */
function getSurt(url) {
  try {
    if (!url.startsWith("https:") && !url.startsWith("http:")) {
      return url;
    }

    // Only strip "www." when it is the first host label. An unanchored replace
    // would also eat a "www." that appears later in the host, path or query
    // (e.g. "http://example.com/www.test/" must keep its path intact).
    url = url.replace(/^(https?:\/\/)www\d*\./, "$1");
    const urlObj = new URL(url.toLowerCase());

    const hostParts = urlObj.hostname.split(".").reverse();
    let surt = hostParts.join(",");
    if (urlObj.port) {
      surt += ":" + urlObj.port;
    }
    surt += ")";
    surt += urlObj.pathname;
    if (urlObj.search) {
      // Sorting makes the key independent of query parameter order;
      // sort() also rewrites urlObj.search with the sorted serialization.
      urlObj.searchParams.sort();
      surt += urlObj.search;
    }
    return surt;
  } catch (e) {
    // Best effort: anything that cannot be parsed is used as its own key.
    return url;
  }
}

/**
 * Rewrite a non-GET request in place into an equivalent GET request whose
 * query string carries the original method and a flattened copy of the body.
 *
 * On success `request.url` gains `__wb_method=<method>&<query>` (joined with
 * `?` or `&`) and `request.method` becomes "GET".
 *
 * @param {{url: string, method: string, headers: {get: function(string): ?string}, postData: (string|Uint8Array|undefined)}} request
 *   Request description; mutated on success.
 * @returns {boolean} true if the request was rewritten; false for GET
 *   requests, unsupported content types, or bodies that yield no query.
 */
function postToGetUrl(request) {
  const { url, method, headers, postData } = request;

  if (method === "GET") {
    return false;
  }

  const contentType = headers.get("content-type") || "";
  // Media types are case-insensitive and may carry whitespace before the
  // parameter separator, so normalize before dispatching on the type.
  const requestMime = contentType.split(";")[0].trim().toLowerCase();

  // Bodies may arrive as raw bytes; the converters below work on text.
  const decodeIfNeeded = (data) =>
    (data instanceof Uint8Array ? new TextDecoder().decode(data) : data);

  let query = null;

  switch (requestMime) {
    case "application/x-www-form-urlencoded":
      // Already in query-string form.
      query = decodeIfNeeded(postData);
      break;

    case "text/plain":
    case "application/json":
      query = jsonToQueryString(decodeIfNeeded(postData));
      break;

    case "multipart/form-data":
      // The full header value is needed here: it carries the boundary.
      query = mfdToQueryString(decodeIfNeeded(postData), contentType);
      break;

    default:
      return false;
  }

  if (!query) {
    return false;
  }

  const start = (url.indexOf("?") > 0 ? "&" : "?");
  request.url += `${start}__wb_method=${method}&${query}`;
  request.method = "GET";
  return true;
}

/**
 * Flatten a JSON document into a URL query string.
 *
 * Every primitive value found anywhere in the structure is stored under its
 * own property name (array items under their index). Nesting is not encoded
 * in the key, so a later occurrence of a name overwrites an earlier one.
 * `null`, objects, functions and `undefined` are not emitted themselves.
 *
 * @param {*} json - JSON text, or an already-parsed value. Text that is not
 *   valid JSON is treated as an empty object.
 * @returns {string} The encoded query string; "" when nothing was collected.
 */
function jsonToQueryString(json) {
  if (typeof json === "string") {
    try {
      json = JSON.parse(json);
    } catch (e) {
      json = {};
    }
  }

  const q = new URLSearchParams();

  // "undefined" is skipped as well: it is not a JSON value, and a missing
  // body would otherwise be serialized as the literal text "=undefined".
  const skippedTypes = ["object", "function", "undefined"];

  try {
    // JSON.stringify is used purely as a tree walker: the replacer sees every
    // key/value pair at every depth; the produced string is discarded.
    JSON.stringify(json, (k, v) => {
      if (!skippedTypes.includes(typeof v)) {
        q.set(k, v);
      }
      return v;
    });
  } catch (e) {
    // Best effort: values that cannot be walked (e.g. cyclic structures)
    // leave whatever was collected so far.
  }

  return q.toString();
}

/**
 * Convert a multipart/form-data body into a URL query string.
 *
 * Each part contributes `name=<first line of its content>`; the value is
 * whatever follows the blank line after the `name="..."` attribute, up to the
 * next line break. Parsing is best effort: on any failure the fields collected
 * so far (possibly none) are returned.
 *
 * @param {string|Uint8Array} mfd - The raw multipart body.
 * @param {string} contentType - Full Content-Type header value, including the
 *   `boundary=` parameter.
 * @returns {string} The encoded query string; "" if no boundary is present or
 *   no field could be extracted.
 */
function mfdToQueryString(mfd, contentType) {
  const params = new URLSearchParams();

  if (mfd instanceof Uint8Array) {
    mfd = new TextDecoder().decode(mfd);
  }

  try {
    const rawBoundary = contentType.split("boundary=")[1];
    if (!rawBoundary) {
      // Without a boundary the body cannot be split into parts.
      return params.toString();
    }

    // The parameter may be followed by further parameters and may be quoted.
    const boundary = rawBoundary.split(";")[0].trim().replace(/^"(.*)"$/, "$1");
    if (!boundary) {
      return params.toString();
    }

    // Boundaries may legally contain regex metacharacters such as + ( ) ? .
    // so they must be escaped before being embedded in a pattern.
    const escapedBoundary = boundary.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // "-*" on both sides absorbs the "--" delimiter prefix and closing suffix.
    const parts = mfd.split(new RegExp("-*" + escapedBoundary + "-*", "mi"));

    for (const part of parts) {
      const m = part.trim().match(/name="([^"]+)"\r\n\r\n(.*)/mi);
      if (m) {
        params.set(m[1], m[2]);
      }
    }
  } catch (e) {
    // Best effort: malformed input (e.g. a missing body or header) yields
    // whatever was collected before the failure.
  }

  return params.toString();
}

// Field names used when no explicit field list is configured.
const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];


Expand Down Expand Up @@ -8437,12 +8555,16 @@ class BaseIndexer

const parser = new WARCParser(reader, params);

for await (const record of parser) {
await record.skipFully();
const result = this.indexRecord(record, parser, filename);
if (result) {
yield result;
}
yield* this.iterRecords(parser, filename);
}
}

async* iterRecords(parser, filename) {
for await (const record of parser) {
await record.skipFully();
const result = this.indexRecord(record, parser, filename);
if (result) {
yield result;
}
}
}
Expand Down Expand Up @@ -8534,6 +8656,7 @@ class CDXIndexer extends Indexer
this.includeAll = opts.all;
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this._lastRecord = null;

switch (opts.format) {
case "cdxj":
Expand All @@ -8546,6 +8669,23 @@ class CDXIndexer extends Indexer
}
}

async* iterRecords(parser, filename) {
this._lastRecord = null;

for await (const record of parser) {
await record.readFully();
const result = this.indexRecord(record, parser, filename);
if (result) {
yield result;
}
}

const result = this.indexRecord(null, parser, filename);
if (result) {
yield result;
}
}

filterRecord(record) {
if (this.includeAll) {
return true;
Expand All @@ -8559,6 +8699,76 @@ class CDXIndexer extends Indexer
return true;
}

indexRecord(record, parser, filename) {
if (this.includeAll) {
if (!record) {
return;
}
return super.indexRecord(record, parser, filename);
}

const lastRecord = this._lastRecord;

if (record) {
record._offset = parser.offset;
record._length = parser.recordLength;
}

if (!lastRecord) {
this._lastRecord = record;
return null;
}

if (!record || lastRecord.warcTargetURI != record.warcTargetURI) {
this._lastRecord = record;
return this.indexRecordPair(lastRecord, null, parser, filename);
}

if (record.warcType === "request" && lastRecord.warcType === "response") {
this._lastRecord = null;
return this.indexRecordPair(lastRecord, record, parser, filename);
} else if (record.warcType === "response" && lastRecord.warcType === "request") {
this._lastRecord = null;
return this.indexRecordPair(record, lastRecord, parser, filename);
} else {
this._lastRecord = record;
return this.indexRecordPair(lastRecord, null, parser, filename);
}
}

indexRecordPair(record, reqRecord, parser, filename) {
let method;
let requestBody;

if (reqRecord && reqRecord.httpHeaders.method !== "GET") {
const request = {
url: record.warcTargetURI,
method: reqRecord.httpHeaders.method,
headers: reqRecord.httpHeaders.headers,
postData: reqRecord.payload,
};

method = request.method;

if (postToGetUrl(request)) {
requestBody = request.url.slice(record.warcTargetURI.length);
}
}

const res = super.indexRecord(record, parser, filename);
if (res && record && record._offset !== undefined) {
res.offset = record._offset;
res.length = record._length;
}
if (method) {
res.method = method;
}
if (requestBody) {
res.requestBody = requestBody;
}
return res;
}

serializeCDXJ(result) {
const { urlkey, timestamp } = result;
delete result.urlkey;
Expand All @@ -8582,7 +8792,7 @@ class CDXIndexer extends Indexer

switch (field) {
case "urlkey":
return this.getSurt(record.warcTargetURI);
return getSurt(record.updatedURL ? record.updatedURL : record.warcTargetURI);

case "timestamp":
value = record.warcDate;
Expand Down Expand Up @@ -8616,26 +8826,6 @@ class CDXIndexer extends Indexer
return value ? value.split(":", 2)[1] : null;
}
}

getSurt(url) {
try {
const urlObj = new URL(url);
if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
return url;
}

const hostParts = urlObj.hostname.split(".").reverse();
let surt = hostParts.join(",");
if (urlObj.port) {
surt += ":" + urlObj.port;
}
surt += ")";
surt += urlObj.pathname;
return surt.toLowerCase();
} catch (e) {
return url;
}
}
}

export { AsyncIterReader, BaseAsyncIterReader, CDXIndexer, Indexer, LimitReader, StatusAndHeaders, StatusAndHeadersParser, WARCParser, WARCRecord, WARCSerializer };
export { AsyncIterReader, BaseAsyncIterReader, CDXIndexer, Indexer, LimitReader, StatusAndHeaders, StatusAndHeadersParser, WARCParser, WARCRecord, WARCSerializer, getSurt, postToGetUrl };
4 changes: 2 additions & 2 deletions dist/warcio.min.js

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions main.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ export { WARCSerializer } from './src/warcserializer';
export { WARCRecord } from './src/warcrecord';

export { Indexer, CDXIndexer } from './src/indexer';

export { postToGetUrl, getSurt } from './src/utils';
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "1.3.3",
"version": "1.4.0",
"main": "index.js",
"module": "main.js",
"license": "Apache-2.0",
Expand Down
Loading

0 comments on commit f865b19

Please sign in to comment.