Skip to content

Commit

Permalink
feat(community): Extend DocxLoader to load .doc files (#7421)
Browse files Browse the repository at this point in the history
  • Loading branch information
Fibii authored Dec 31, 2024
1 parent 33c3d73 commit 2abf88e
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,38 @@ hide_table_of_contents: true

# Docx files

This example goes over how to load data from docx files.
The `DocxLoader` allows you to extract text data from Microsoft Word documents. It supports both the modern `.docx` format and the legacy `.doc` format. Depending on the file type, additional dependencies are required.

# Setup
---

## Setup

To use `DocxLoader`, you'll need the `@langchain/community` integration along with either the `mammoth` or the `word-extractor` package:

- **`mammoth`**: For processing `.docx` files.
- **`word-extractor`**: For handling `.doc` files.

### Installation

#### For `.docx` Files

```bash npm2yarn
npm install @langchain/community @langchain/core mammoth
```

# Usage
#### For `.doc` Files

```bash npm2yarn
npm install @langchain/community @langchain/core word-extractor
```

## Usage

### Loading `.docx` Files

```typescript
For `.docx` files, there is no need to explicitly specify any parameters when initializing the loader:

```javascript
import { DocxLoader } from "@langchain/community/document_loaders/fs/docx";

const loader = new DocxLoader(
Expand All @@ -23,3 +44,20 @@ const loader = new DocxLoader(

const docs = await loader.load();
```

### Loading `.doc` Files

For `.doc` files, you must explicitly specify the `type` as `doc` when initializing the loader:

```javascript
import { DocxLoader } from "@langchain/community/document_loaders/fs/docx";

const loader = new DocxLoader(
"src/document_loaders/tests/example_data/attention.doc",
{
type: "doc",
}
);

const docs = await loader.load();
```
6 changes: 6 additions & 0 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
"@types/pg": "^8.11.0",
"@types/pg-copy-streams": "^1.2.2",
"@types/uuid": "^9",
"@types/word-extractor": "^1",
"@types/ws": "^8",
"@typescript-eslint/eslint-plugin": "^5.58.0",
"@typescript-eslint/parser": "^5.58.0",
Expand Down Expand Up @@ -217,6 +218,7 @@
"voy-search": "0.6.2",
"weaviate-ts-client": "^1.4.0",
"web-auth-library": "^1.0.3",
"word-extractor": "^1.0.4",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
},
Expand Down Expand Up @@ -344,6 +346,7 @@
"voy-search": "0.6.2",
"weaviate-ts-client": "*",
"web-auth-library": "^1.0.3",
"word-extractor": "*",
"ws": "^8.14.2",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
Expand Down Expand Up @@ -703,6 +706,9 @@
"web-auth-library": {
"optional": true
},
"word-extractor": {
"optional": true
},
"ws": {
"optional": true
},
Expand Down
82 changes: 80 additions & 2 deletions libs/langchain-community/src/document_loaders/fs/docx.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
import { Document } from "@langchain/core/documents";
import { BufferLoader } from "langchain/document_loaders/fs/buffer";

/**
 * Options accepted by the `DocxLoader` constructor.
 */
type DocxLoaderOptions = {
  /**
   * Which Word format the input is in: `"doc"` for the legacy binary format,
   * `"docx"` for the modern format.
   */
  type: "doc" | "docx";
};
/**
* A class that extends the `BufferLoader` class. It represents a document
* loader that loads documents from Word files (`.docx` by default, or legacy `.doc`).
* It has a constructor that takes a `filePathOrBlob` parameter representing the path to the word
* file or a Blob object, and an optional `options` parameter of type
* `DocxLoaderOptions`
*/
export class DocxLoader extends BufferLoader {
constructor(filePathOrBlob: string | Blob) {
protected options: DocxLoaderOptions = { type: "docx" };

constructor(filePathOrBlob: string | Blob, options?: DocxLoaderOptions) {
  super(filePathOrBlob);
  // No options supplied: keep the field's default value.
  if (!options) {
    return;
  }
  // Store a shallow copy rather than the caller's own object.
  this.options = { ...options };
}

/**
* A method that takes a `raw` buffer and `metadata` as parameters and
* returns a promise that resolves to an array of `Document` instances. It
* uses the `extractRawText` function from the `mammoth` module to extract
* uses the `extractRawText` function from the `mammoth` module or
* `extract` method from the `word-extractor` module to extract
* the raw text content from the buffer. If the extracted text content is
* empty, it returns an empty array. Otherwise, it creates a new
* `Document` instance with the extracted text content and the provided
Expand All @@ -26,6 +40,31 @@ export class DocxLoader extends BufferLoader {
raw: Buffer,
metadata: Document["metadata"]
): Promise<Document[]> {
if (this.options.type === "doc") {
return this.parseDoc(raw, metadata);
}
return this.parseDocx(raw, metadata);
}

/**
* A private method that takes a `raw` buffer and `metadata` as parameters and
* returns a promise that resolves to an array of `Document` instances. It
* uses the `extractRawText` function from the `mammoth` module to extract
* the raw text content from the buffer. If the extracted text content is
* empty, it returns an empty array. Otherwise, it creates a new
* `Document` instance with the extracted text content and the provided
* metadata, and returns it as an array.
* @param raw The raw buffer from which to extract text content.
* @param metadata The metadata to be associated with the created `Document` instance.
* @returns A promise that resolves to an array of `Document` instances.
*/
private async parseDocx(
raw: Buffer,
metadata: Document["metadata"]
): Promise<Document[]> {
if (this.options.type === "doc") {
return this.parseDoc(raw, metadata);
}
const { extractRawText } = await DocxLoaderImports();
const docx = await extractRawText({
buffer: raw,
Expand All @@ -40,6 +79,33 @@ export class DocxLoader extends BufferLoader {
}),
];
}

/**
 * A private method that takes a `raw` buffer and `metadata` as parameters and
 * returns a promise that resolves to an array of `Document` instances. It
 * uses the `extract` method from the `word-extractor` module to read the
 * document and `getBody()` to obtain its body text. If the extracted text
 * is empty, it returns an empty array. Otherwise, it creates a single
 * `Document` instance with the extracted text and the provided metadata,
 * and returns it as a one-element array.
 * @param raw The raw buffer from which to extract text content.
 * @param metadata The metadata to be associated with the created `Document` instance.
 * @returns A promise that resolves to an array of `Document` instances.
 * @throws If the `word-extractor` package cannot be loaded.
 */
private async parseDoc(
  raw: Buffer,
  metadata: Document["metadata"]
): Promise<Document[]> {
  const WordExtractor = await DocLoaderImports();
  const extractor = new WordExtractor();
  const doc = await extractor.extract(raw);
  const pageContent = doc.getBody();

  // Honor the documented contract: no extracted text means no documents,
  // rather than a single `Document` with empty `pageContent`.
  if (!pageContent) {
    return [];
  }

  return [
    new Document({
      pageContent,
      metadata,
    }),
  ];
}
}

async function DocxLoaderImports() {
Expand All @@ -53,3 +119,15 @@ async function DocxLoaderImports() {
);
}
}

/**
 * Dynamically imports the `word-extractor` package and returns its default
 * export, which callers instantiate with `new`.
 *
 * If the import fails, the underlying error is logged with `console.error`
 * and a new `Error` with installation instructions is thrown instead.
 *
 * @returns The default export of the `word-extractor` module.
 * @throws If `word-extractor` cannot be imported.
 */
async function DocLoaderImports() {
  try {
    const { default: extractorCtor } = await import("word-extractor");
    return extractorCtor;
  } catch (importError) {
    console.error(importError);
    throw new Error(
      "Failed to load word-extractor. Please install it with eg. `npm install word-extractor`."
    );
  }
}
16 changes: 15 additions & 1 deletion libs/langchain-community/src/document_loaders/tests/docx.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import * as url from "node:url";
import * as path from "node:path";
import { DocxLoader } from "../fs/docx.js";

test("Test Word doc loader from file", async () => {
test("Test Word doc loader from .docx file", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/attention.docx"
Expand All @@ -14,3 +14,17 @@ test("Test Word doc loader from file", async () => {
expect(docs.length).toBe(1); // not much text in the example
expect(docs[0].pageContent).toContain("an interesting activity");
});

// Loads the legacy `.doc` fixture through DocxLoader with `type: "doc"` and
// checks that a single document containing the expected phrase is produced.
test("Test Word doc loader from .doc file", async () => {
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const docPath = path.resolve(testDir, "./example_data/attention.doc");

  const docs = await new DocxLoader(docPath, { type: "doc" }).load();

  expect(docs.length).toBe(1); // not much text in the example
  expect(docs[0].pageContent).toContain("an interesting activity");
});
Binary file not shown.
33 changes: 33 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11886,6 +11886,7 @@ __metadata:
"@types/pg": ^8.11.0
"@types/pg-copy-streams": ^1.2.2
"@types/uuid": ^9
"@types/word-extractor": ^1
"@types/ws": ^8
"@typescript-eslint/eslint-plugin": ^5.58.0
"@typescript-eslint/parser": ^5.58.0
Expand Down Expand Up @@ -11985,6 +11986,7 @@ __metadata:
voy-search: 0.6.2
weaviate-ts-client: ^1.4.0
web-auth-library: ^1.0.3
word-extractor: ^1.0.4
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
zod: ^3.22.3
Expand Down Expand Up @@ -12113,6 +12115,7 @@ __metadata:
voy-search: 0.6.2
weaviate-ts-client: "*"
web-auth-library: ^1.0.3
word-extractor: "*"
ws: ^8.14.2
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
Expand Down Expand Up @@ -12353,6 +12356,8 @@ __metadata:
optional: true
web-auth-library:
optional: true
word-extractor:
optional: true
ws:
optional: true
youtube-transcript:
Expand Down Expand Up @@ -20168,6 +20173,15 @@ __metadata:
languageName: node
linkType: hard

"@types/word-extractor@npm:^1":
version: 1.0.6
resolution: "@types/word-extractor@npm:1.0.6"
dependencies:
"@types/node": "*"
checksum: 3619602252493e1ad2671af6ce73a342cdc8452d5a3473123474ab6f3f5deb466b9c4cfdcbac75eacb430f545ae5708e3165b01e03b42053ca9fa87d2920fe3d
languageName: node
linkType: hard

"@types/ws@npm:8.5.9":
version: 8.5.9
resolution: "@types/ws@npm:8.5.9"
Expand Down Expand Up @@ -39965,6 +39979,15 @@ __metadata:
languageName: node
linkType: hard

"saxes@npm:^5.0.1":
version: 5.0.1
resolution: "saxes@npm:5.0.1"
dependencies:
xmlchars: ^2.2.0
checksum: 5636b55cf15f7cf0baa73f2797bf992bdcf75d1b39d82c0aa4608555c774368f6ac321cb641fd5f3d3ceb87805122cd47540da6a7b5960fe0dbdb8f8c263f000
languageName: node
linkType: hard

"saxes@npm:^6.0.0":
version: 6.0.0
resolution: "saxes@npm:6.0.0"
Expand Down Expand Up @@ -44345,6 +44368,16 @@ __metadata:
languageName: node
linkType: hard

"word-extractor@npm:^1.0.4":
version: 1.0.4
resolution: "word-extractor@npm:1.0.4"
dependencies:
saxes: ^5.0.1
yauzl: ^2.10.0
checksum: 04ed0ef1dfd6b26ab2094671e72f16e5a948f9978da3fd6b9d01ff475ecd048199f529d989f1d0dfe3da684a1aa8bb86e4388edabd706fd74a0be4eb030183cd
languageName: node
linkType: hard

"word-wrap@npm:^1.2.3, word-wrap@npm:~1.2.3":
version: 1.2.3
resolution: "word-wrap@npm:1.2.3"
Expand Down

0 comments on commit 2abf88e

Please sign in to comment.