Skip to content

Commit

Permalink
support indexing HTTP request (and WARC) records:
Browse files Browse the repository at this point in the history
- support customizing --fields for cdxj indexing
- support 'req.*' fields, which apply only to request records; other headers apply to the response/main record
- support 'referrer' as special shortcut for 'req.http:referer'
- tests: update tests to include 'req.http:cookie' in cdx
- tests: update tests to include 'referrer' in cdx

version: bump to 2.4.0
  • Loading branch information
ikreymer committed Nov 6, 2024
1 parent fb1ff9c commit 4499294
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 25 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "warcio",
"version": "2.3.1",
"version": "2.4.0",
"keywords": [
"WARC",
"web archiving"
Expand Down
5 changes: 5 additions & 0 deletions src/commands/args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ export const cdxIndexCommandArgs = (yarg: yargs.Argv) => {
describe:
"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",
type: "boolean",
})
.option("fields", {
alias: "f",
describe: "fields to include in index",
type: "string",
});
};

Expand Down
9 changes: 8 additions & 1 deletion src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ export type { WARCSerializerOpts } from "./warcserializer";
export { WARCRecord, WARC_1_1, WARC_1_0 } from "./warcrecord";
export type { WARCRecordOpts, WARCType } from "./warcrecord";

export { Indexer, CDXIndexer, CDXAndRecordIndexer } from "./indexer";
export {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_FIELDS,
DEFAULT_CDX_FIELDS,
DEFAULT_LEGACY_CDX_FIELDS,
} from "./indexer";

export {
postToGetUrl,
Expand Down
55 changes: 43 additions & 12 deletions src/lib/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,27 @@ import {
type IndexerOffsetLength,
} from "./types";

const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];
export const DEFAULT_FIELDS = ["offset", "warc-type", "warc-target-uri"];

// ===========================================================================
abstract class BaseIndexer {
opts: Partial<IndexCommandArgs>;
fields: string[];
reqFields: string[];
parseHttp: boolean;

constructor(opts: Partial<IndexCommandArgs> = {}) {
constructor(
opts: Partial<IndexCommandArgs> = {},
defaultFields: string[] = DEFAULT_FIELDS,
) {
this.opts = opts;
this.fields = opts.fields ? opts.fields.split(",") : DEFAULT_FIELDS;
if (opts.fields) {
this.fields = opts.fields.split(",");
this.reqFields = this.fields.filter((x) => isRequestHeader(x));
} else {
this.fields = defaultFields;
this.reqFields = [];
}
this.parseHttp = false;
}

Expand Down Expand Up @@ -109,6 +119,15 @@ abstract class BaseIndexer {
field: string,
record: WARCRecord,
): string | number | null | undefined {
// only handle req. fields for 'request' records
if (field.startsWith("req.")) {
if (record.warcType === "request") {
field = field.slice(4);
} else {
return null;
}
}

if (field === "http:status") {
if (
record.httpHeaders &&
Expand Down Expand Up @@ -136,8 +155,8 @@ abstract class BaseIndexer {

// ===========================================================================
export class Indexer extends BaseIndexer {
constructor(opts?: Partial<IndexCommandArgs>) {
super(opts);
constructor(opts?: Partial<IndexCommandArgs>, defaultFields?: string[]) {
super(opts, defaultFields);

for (const field of this.fields) {
if (field.startsWith("http:")) {
Expand All @@ -149,9 +168,9 @@ export class Indexer extends BaseIndexer {
}

// ===========================================================================
const DEFAULT_CDX_FIELDS =
export const DEFAULT_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(",");
const DEFAULT_LEGACY_CDX_FIELDS =
export const DEFAULT_LEGACY_CDX_FIELDS =
"urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(
",",
);
Expand All @@ -172,10 +191,9 @@ export class CDXIndexer extends Indexer {
_lastRecord: WARCRecord | null;

constructor(opts?: Partial<CdxIndexCommandArgs>) {
super(opts);
super(opts, DEFAULT_CDX_FIELDS);
this.includeAll = Boolean(opts?.all);
this.overrideIndexForAll = Boolean(opts?.all);
this.fields = DEFAULT_CDX_FIELDS;
this.parseHttp = true;
this.noSurt = Boolean(opts?.noSurt);
this._lastRecord = null;
Expand Down Expand Up @@ -322,6 +340,12 @@ export class CDXIndexer extends Indexer {
if (requestBody) {
res["requestBody"] = requestBody;
}

if (reqRecord && this.reqFields.length) {
for (const field of this.reqFields) {
this.setField(field, reqRecord, res);
}
}
}

return res;
Expand All @@ -334,12 +358,12 @@ export class CDXIndexer extends Indexer {
delete result["timestamp"];

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const replacer = (key: string, value: any) : any => {
const replacer = (key: string, value: any): any => {
if (["offset", "length", "status"].includes(key)) {
return value === null || value === undefined ? "" : "" + value;
}
return value;
}
};

return `${urlkey} ${timestamp} ${JSON.stringify(result, replacer)}\n`;
}
Expand Down Expand Up @@ -389,12 +413,15 @@ export class CDXIndexer extends Indexer {
case "status":
return super.getField("http:status", record);

case "referrer":
return super.getField("req.http:referer", record);

case "digest":
value = record.warcPayloadDigest;
return value ? value.split(":", 2)[1] : null;

default:
return null;
return super.getField(field, record);
}
}
}
Expand All @@ -416,3 +443,7 @@ export class CDXAndRecordIndexer extends CDXIndexer {
return cdx && { cdx, record, reqRecord };
}
}

/**
 * Reports whether an index field name refers to a value taken from the
 * request record.
 *
 * A name qualifies when it begins with the literal prefix `req.` (the prefix
 * check is case-sensitive) or when it equals `referrer`, compared
 * case-insensitively.
 *
 * @param header - the field name to classify
 * @returns `true` if the name is a request field, `false` otherwise
 */
export function isRequestHeader(header: string) {
  // Explicitly prefixed request fields, e.g. "req.http:cookie".
  if (header.startsWith("req.")) {
    return true;
  }

  // Unprefixed shortcut name; letter case is ignored for this comparison only.
  return header.toLowerCase() === "referrer";
}
10 changes: 7 additions & 3 deletions src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export function postToGetUrl(request: Request) {
return false;
}

const getContentType = (headers: Headers | Map<string, string>) : string => {
const getContentType = (headers: Headers | Map<string, string>): string => {
const ct = headers.get("content-type");
if (ct) {
return ct;
Expand All @@ -75,7 +75,7 @@ export function postToGetUrl(request: Request) {
}
}
return "";
}
};

const contentType = getContentType(headers);

Expand Down Expand Up @@ -124,7 +124,11 @@ export function postToGetUrl(request: Request) {
}

if (query != null) {
request.url = appendRequestQuery(request.url, decodeURI(query), request.method);
request.url = appendRequestQuery(
request.url,
decodeURI(query),
request.method,
);
request.method = "GET";
request.requestBody = query;
return true;
Expand Down
35 changes: 30 additions & 5 deletions test/testIndexer.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import fs from "fs";
import { jest } from "@jest/globals";
import { main } from "../src/commands";
import { Indexer, CDXIndexer, CDXAndRecordIndexer } from "../src/lib";
import {
Indexer,
CDXIndexer,
CDXAndRecordIndexer,
DEFAULT_CDX_FIELDS,
} from "../src/lib";
import { WritableStreamBuffer } from "stream-buffers";

function get_warc_path(filename: string) {
Expand Down Expand Up @@ -90,6 +95,21 @@ com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit",
);
});

test("cdxj warc.gz with referrer", async () => {
await index(
[
"cdx-index",
get_warc_path("data/example.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "referrer"].join(","),
],
`\
com,example)/ 20170306040206 {"url":"http://example.com/","mime":"text/html","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"1228","offset":"784","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
com,example)/ 20170306040348 {"url":"http://example.com/","mime":"warc/revisit","status":"200","digest":"G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK","length":"586","offset":"2621","filename":"example.warc.gz","referrer":"https://webrecorder.io/temp-MJFXHZ4S/temp/recording-session/record/http://example.com/"}
`,
);
});

test("cdx11 warc.gz", async () => {
await index(
["cdx-index", get_warc_path("data/example.warc.gz"), "--format", "cdx"],
Expand Down Expand Up @@ -154,11 +174,16 @@ com,example)/ 20170306040348 http://example.com/ warc/revisit 200 G7HRM7BGOKSKMS

test("post append", async () => {
await index(
["cdx-index", get_warc_path("data/post-test.warc.gz")],
[
"cdx-index",
get_warc_path("data/post-test.warc.gz"),
"--fields",
[...DEFAULT_CDX_FIELDS, "req.http:cookie"].join(","),
],
`\
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^"}
org,httpbin)/post?__wb_method=post&foo=bar&test=abc 20140610000859 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M532K5WS4GY2H4OVZO6HRPOP47A7KDWU","length":"720","offset":"0","filename":"post-test.warc.gz","method":"POST","requestBody":"foo=bar&test=abc","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&a=1&b=[]&c=3 20140610001151 {"url":"http://httpbin.org/post","mime":"application/json","status":"200","digest":"M7YCTM7HS3YKYQTAWQVMQSQZBNEOXGU2","length":"723","offset":"1196","filename":"post-test.warc.gz","method":"POST","requestBody":"A=1&B=[]&C=3","req.http:cookie":"Max-Age=3600; Path=/"}
org,httpbin)/post?__wb_method=post&data=^&foo=bar 20140610001255 {"url":"http://httpbin.org/post?foo=bar","mime":"application/json","status":"200","digest":"B6E5P6JUZI6UPDTNO4L2BCHMGLTNCUAJ","length":"723","offset":"2395","filename":"post-test.warc.gz","method":"POST","requestBody":"data=^","req.http:cookie":"Max-Age=3600; Path=/"}
`,
);
});
Expand Down
6 changes: 3 additions & 3 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ describe("utils", () => {
});

test("surt with space", () => {
expect(getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d ")).toBe(
"com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h="
);
expect(
getSurt("https://www.example.com/some/path?e+f=&a b&a+b=c&g^h=&d "),
).toBe("com,example)/some/path?a%20b&a+b=c&d&e+f=&g^h=");
});
});

0 comments on commit 4499294

Please sign in to comment.