diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx index baaf464a5e5b..8e46cde7a1b8 100644 --- a/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx +++ b/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx @@ -4,17 +4,38 @@ hide_table_of_contents: true # Docx files -This example goes over how to load data from docx files. +The `DocxLoader` allows you to extract text data from Microsoft Word documents. It supports both the modern `.docx` format and the legacy `.doc` format. Each format requires its own additional dependency, as described below. -# Setup +--- + +## Setup + +To use `DocxLoader`, you'll need the `@langchain/community` integration along with either the `mammoth` or the `word-extractor` package: + +- **`mammoth`**: For processing `.docx` files. +- **`word-extractor`**: For handling `.doc` files. + +### Installation + +#### For `.docx` Files ```bash npm2yarn npm install @langchain/community @langchain/core mammoth ``` -# Usage +#### For `.doc` Files + +```bash npm2yarn +npm install @langchain/community @langchain/core word-extractor +``` + +## Usage + +### Loading `.docx` Files -```typescript +For `.docx` files, you only need to pass the file path; no options are required when initializing the loader: + +```javascript import { DocxLoader } from "@langchain/community/document_loaders/fs/docx"; const loader = new DocxLoader( @@ -23,3 +44,20 @@ const loader = new DocxLoader( const docs = await loader.load(); ``` + +### Loading `.doc` Files + +For `.doc` files, you must explicitly set the `type` option to `"doc"` when initializing the loader: + +```javascript +import { DocxLoader } from "@langchain/community/document_loaders/fs/docx"; + +const loader = new DocxLoader( + "src/document_loaders/tests/example_data/attention.doc", + { + type: "doc", + } +); + +const docs = await loader.load(); +``` diff --git
a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 81077ddd3343..4695474df42c 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -125,6 +125,7 @@ "@types/pg": "^8.11.0", "@types/pg-copy-streams": "^1.2.2", "@types/uuid": "^9", + "@types/word-extractor": "^1", "@types/ws": "^8", "@typescript-eslint/eslint-plugin": "^5.58.0", "@typescript-eslint/parser": "^5.58.0", @@ -217,6 +218,7 @@ "voy-search": "0.6.2", "weaviate-ts-client": "^1.4.0", "web-auth-library": "^1.0.3", + "word-extractor": "^1.0.4", "youtube-transcript": "^1.0.6", "youtubei.js": "^9.1.0" }, @@ -344,6 +346,7 @@ "voy-search": "0.6.2", "weaviate-ts-client": "*", "web-auth-library": "^1.0.3", + "word-extractor": "*", "ws": "^8.14.2", "youtube-transcript": "^1.0.6", "youtubei.js": "^9.1.0" @@ -703,6 +706,9 @@ "web-auth-library": { "optional": true }, + "word-extractor": { + "optional": true + }, "ws": { "optional": true }, diff --git a/libs/langchain-community/src/document_loaders/fs/docx.ts b/libs/langchain-community/src/document_loaders/fs/docx.ts index 72518aec3b2e..e1edef2fc8e7 100644 --- a/libs/langchain-community/src/document_loaders/fs/docx.ts +++ b/libs/langchain-community/src/document_loaders/fs/docx.ts @@ -1,19 +1,33 @@ import { Document } from "@langchain/core/documents"; import { BufferLoader } from "langchain/document_loaders/fs/buffer"; +type DocxLoaderOptions = { + type: "docx" | "doc"; +}; /** * A class that extends the `BufferLoader` class. It represents a document * loader that loads documents from DOCX files. 
+ * It has a constructor that takes a `filePathOrBlob` parameter representing the path to the Word + * file or a Blob object, and an optional `options` parameter of type + * `DocxLoaderOptions` that selects the format (`docx` by default, or `doc`). */ export class DocxLoader extends BufferLoader { - constructor(filePathOrBlob: string | Blob) { + protected options: DocxLoaderOptions = { type: "docx" }; + + constructor(filePathOrBlob: string | Blob, options?: DocxLoaderOptions) { super(filePathOrBlob); + if (options) { + this.options = { + ...options, + }; + } } /** * A method that takes a `raw` buffer and `metadata` as parameters and * returns a promise that resolves to an array of `Document` instances. It - * uses the `extractRawText` function from the `mammoth` module to extract + * uses the `extractRawText` function from the `mammoth` module or the + * `extract` method from the `word-extractor` module to extract * the raw text content from the buffer. If the extracted text content is * empty, it returns an empty array. Otherwise, it creates a new * `Document` instance with the extracted text content and the provided @@ -26,6 +40,31 @@ export class DocxLoader extends BufferLoader { raw: Buffer, metadata: Document["metadata"] ): Promise { + if (this.options.type === "doc") { + return this.parseDoc(raw, metadata); + } + return this.parseDocx(raw, metadata); + } + + /** + * A private method that takes a `raw` buffer and `metadata` as parameters and + * returns a promise that resolves to an array of `Document` instances. It + * uses the `extractRawText` function from the `mammoth` module to extract + * the raw text content from the buffer. If the extracted text content is + * empty, it returns an empty array. Otherwise, it creates a new + * `Document` instance with the extracted text content and the provided + * metadata, and returns it as an array. + * @param raw The raw buffer from which to extract text content. + * @param metadata The metadata to be associated with the created `Document` instance. 
+ * @returns A promise that resolves to an array of `Document` instances. + */ + private async parseDocx( raw: Buffer, metadata: Document["metadata"] ): Promise { + if (this.options.type === "doc") { + return this.parseDoc(raw, metadata); + } const { extractRawText } = await DocxLoaderImports(); const docx = await extractRawText({ buffer: raw, @@ -40,6 +79,33 @@ export class DocxLoader extends BufferLoader { }), ]; } + + /** + * A private method that takes a `raw` buffer and `metadata` as parameters and + * returns a promise that resolves to an array of `Document` instances. It + * uses the `extract` method from the `word-extractor` module to extract + * the raw text content from the buffer. No empty-content check is + * performed: it always creates a single + * `Document` instance with the extracted body text and the provided + * metadata, and returns it as an array. + * @param raw The raw buffer from which to extract text content. + * @param metadata The metadata to be associated with the created `Document` instance. + * @returns A promise that resolves to an array of `Document` instances. + */ + private async parseDoc( + raw: Buffer, + metadata: Document["metadata"] + ): Promise { + const WordExtractor = await DocLoaderImports(); + const extractor = new WordExtractor(); + const doc = await extractor.extract(raw); + return [ + new Document({ + pageContent: doc.getBody(), + metadata, + }), + ]; + } } async function DocxLoaderImports() { @@ -53,3 +119,15 @@ async function DocxLoaderImports() { ); } } + +async function DocLoaderImports() { + try { + const WordExtractor = await import("word-extractor"); + return WordExtractor.default; + } catch (e) { + console.error(e); + throw new Error( + "Failed to load word-extractor. Please install it with eg. `npm install word-extractor`." 
+ ); + } +} diff --git a/libs/langchain-community/src/document_loaders/tests/docx.test.ts b/libs/langchain-community/src/document_loaders/tests/docx.test.ts index 63395bb51bc0..82e66aa91907 100644 --- a/libs/langchain-community/src/document_loaders/tests/docx.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/docx.test.ts @@ -3,7 +3,7 @@ import * as url from "node:url"; import * as path from "node:path"; import { DocxLoader } from "../fs/docx.js"; -test("Test Word doc loader from file", async () => { +test("Test Word doc loader from .docx file", async () => { const filePath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)), "./example_data/attention.docx" @@ -14,3 +14,17 @@ test("Test Word doc loader from file", async () => { expect(docs.length).toBe(1); // not much text in the example expect(docs[0].pageContent).toContain("an interesting activity"); }); + +test("Test Word doc loader from .doc file", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/attention.doc" + ); + const loader = new DocxLoader(filePath, { + type: "doc", + }); + const docs = await loader.load(); + + expect(docs.length).toBe(1); // not much text in the example + expect(docs[0].pageContent).toContain("an interesting activity"); +}); diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/attention.doc b/libs/langchain-community/src/document_loaders/tests/example_data/attention.doc new file mode 100644 index 000000000000..e68399c5a7c8 Binary files /dev/null and b/libs/langchain-community/src/document_loaders/tests/example_data/attention.doc differ diff --git a/yarn.lock b/yarn.lock index 590f795ce465..3d1769869beb 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11886,6 +11886,7 @@ __metadata: "@types/pg": ^8.11.0 "@types/pg-copy-streams": ^1.2.2 "@types/uuid": ^9 + "@types/word-extractor": ^1 "@types/ws": ^8 "@typescript-eslint/eslint-plugin": ^5.58.0 "@typescript-eslint/parser": 
^5.58.0 @@ -11985,6 +11986,7 @@ __metadata: voy-search: 0.6.2 weaviate-ts-client: ^1.4.0 web-auth-library: ^1.0.3 + word-extractor: ^1.0.4 youtube-transcript: ^1.0.6 youtubei.js: ^9.1.0 zod: ^3.22.3 @@ -12113,6 +12115,7 @@ __metadata: voy-search: 0.6.2 weaviate-ts-client: "*" web-auth-library: ^1.0.3 + word-extractor: "*" ws: ^8.14.2 youtube-transcript: ^1.0.6 youtubei.js: ^9.1.0 @@ -12353,6 +12356,8 @@ __metadata: optional: true web-auth-library: optional: true + word-extractor: + optional: true ws: optional: true youtube-transcript: @@ -20168,6 +20173,15 @@ __metadata: languageName: node linkType: hard +"@types/word-extractor@npm:^1": + version: 1.0.6 + resolution: "@types/word-extractor@npm:1.0.6" + dependencies: + "@types/node": "*" + checksum: 3619602252493e1ad2671af6ce73a342cdc8452d5a3473123474ab6f3f5deb466b9c4cfdcbac75eacb430f545ae5708e3165b01e03b42053ca9fa87d2920fe3d + languageName: node + linkType: hard + "@types/ws@npm:8.5.9": version: 8.5.9 resolution: "@types/ws@npm:8.5.9" @@ -39965,6 +39979,15 @@ __metadata: languageName: node linkType: hard +"saxes@npm:^5.0.1": + version: 5.0.1 + resolution: "saxes@npm:5.0.1" + dependencies: + xmlchars: ^2.2.0 + checksum: 5636b55cf15f7cf0baa73f2797bf992bdcf75d1b39d82c0aa4608555c774368f6ac321cb641fd5f3d3ceb87805122cd47540da6a7b5960fe0dbdb8f8c263f000 + languageName: node + linkType: hard + "saxes@npm:^6.0.0": version: 6.0.0 resolution: "saxes@npm:6.0.0" @@ -44345,6 +44368,16 @@ __metadata: languageName: node linkType: hard +"word-extractor@npm:^1.0.4": + version: 1.0.4 + resolution: "word-extractor@npm:1.0.4" + dependencies: + saxes: ^5.0.1 + yauzl: ^2.10.0 + checksum: 04ed0ef1dfd6b26ab2094671e72f16e5a948f9978da3fd6b9d01ff475ecd048199f529d989f1d0dfe3da684a1aa8bb86e4388edabd706fd74a0be4eb030183cd + languageName: node + linkType: hard + "word-wrap@npm:^1.2.3, word-wrap@npm:~1.2.3": version: 1.2.3 resolution: "word-wrap@npm:1.2.3"