feat(community): Extend DocxLoader to load .doc files (#7421)

langchain-ai · Dec 31, 2024 · 2abf88e · 2abf88e
1 parent 33c3d73
commit 2abf88e
Show file tree

Hide file tree

Showing 6 changed files with 176 additions and 7 deletions.
diff --git a/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx b/docs/core_docs/docs/integrations/document_loaders/file_loaders/docx.mdx
@@ -4,17 +4,38 @@ hide_table_of_contents: true
 
 # Docx files
 
-This example goes over how to load data from docx files.
+The `DocxLoader` allows you to extract text data from Microsoft Word documents. It supports both the modern `.docx` format and the legacy `.doc` format. Depending on the file type, additional dependencies are required.
 
-# Setup
+---
+
+## Setup
+
+To use `DocxLoader`, you'll need the `@langchain/community` integration along with either the `mammoth` or the `word-extractor` package:
+
+- **`mammoth`**: For processing `.docx` files.
+- **`word-extractor`**: For handling `.doc` files.
+
+### Installation
+
+#### For `.docx` Files
 
 ```bash npm2yarn
 npm install @langchain/community @langchain/core mammoth
 ```
 
-# Usage
+#### For `.doc` Files
+
+```bash npm2yarn
+npm install @langchain/community @langchain/core word-extractor
+```
+
+## Usage
+
+### Loading `.docx` Files
 
-```typescript
+For `.docx` files, there is no need to explicitly specify any parameters when initializing the loader:
+
+```javascript
 import { DocxLoader } from "@langchain/community/document_loaders/fs/docx";
 
 const loader = new DocxLoader(
@@ -23,3 +44,20 @@ const loader = new DocxLoader(
 
 const docs = await loader.load();
 ```
+
+### Loading `.doc` Files
+
+For `.doc` files, you must explicitly specify the `type` as `doc` when initializing the loader:
+
+```javascript
+import { DocxLoader } from "@langchain/community/document_loaders/fs/docx";
+
+const loader = new DocxLoader(
+  "src/document_loaders/tests/example_data/attention.doc",
+  {
+    type: "doc",
+  }
+);
+
+const docs = await loader.load();
+```
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
@@ -125,6 +125,7 @@
     "@types/pg": "^8.11.0",
     "@types/pg-copy-streams": "^1.2.2",
     "@types/uuid": "^9",
+    "@types/word-extractor": "^1",
     "@types/ws": "^8",
     "@typescript-eslint/eslint-plugin": "^5.58.0",
     "@typescript-eslint/parser": "^5.58.0",
@@ -217,6 +218,7 @@
     "voy-search": "0.6.2",
     "weaviate-ts-client": "^1.4.0",
     "web-auth-library": "^1.0.3",
+    "word-extractor": "^1.0.4",
     "youtube-transcript": "^1.0.6",
     "youtubei.js": "^9.1.0"
   },
@@ -344,6 +346,7 @@
     "voy-search": "0.6.2",
     "weaviate-ts-client": "*",
     "web-auth-library": "^1.0.3",
+    "word-extractor": "*",
     "ws": "^8.14.2",
     "youtube-transcript": "^1.0.6",
     "youtubei.js": "^9.1.0"
@@ -703,6 +706,9 @@
     "web-auth-library": {
       "optional": true
     },
+    "word-extractor": {
+      "optional": true
+    },
     "ws": {
       "optional": true
     },

diff --git a/libs/langchain-community/src/document_loaders/fs/docx.ts b/libs/langchain-community/src/document_loaders/fs/docx.ts
@@ -1,19 +1,33 @@
 import { Document } from "@langchain/core/documents";
 import { BufferLoader } from "langchain/document_loaders/fs/buffer";
 
+/**
+ * Options accepted by the `DocxLoader` constructor.
+ */
+type DocxLoaderOptions = {
+  /**
+   * Format of the input file. `DocxLoader.parse` routes `"doc"` to the
+   * `word-extractor` based parser and every other value to the `mammoth`
+   * based parser.
+   */
+  type: "docx" | "doc";
+};
 /**
  * A class that extends the `BufferLoader` class. It represents a document
  * loader that loads documents from DOCX files.
+ * Its constructor takes a `filePathOrBlob` parameter, which is either the path to the Word
+ * file or a Blob object, and an optional `options` parameter of type
+ * `DocxLoaderOptions`.
  */
 export class DocxLoader extends BufferLoader {
-  constructor(filePathOrBlob: string | Blob) {
+  protected options: DocxLoaderOptions = { type: "docx" };
+
+  /**
+   * @param filePathOrBlob Path to the Word file, or a Blob holding its contents.
+   * @param options Optional loader options; when omitted the field's default
+   * (`{ type: "docx" }`) stays in effect.
+   */
+  constructor(filePathOrBlob: string | Blob, options?: DocxLoaderOptions) {
     super(filePathOrBlob);
+    // Nothing supplied: keep the default assigned by the field initializer.
+    if (!options) {
+      return;
+    }
+    // Store a shallow copy rather than the caller's own object.
+    this.options = { ...options };
   }
 
   /**
    * A method that takes a `raw` buffer and `metadata` as parameters and
    * returns a promise that resolves to an array of `Document` instances. It
-   * uses the `extractRawText` function from the `mammoth` module to extract
+   * uses the `extractRawText` function from the `mammoth` module or
+   * `extract` method from the `word-extractor` module to extract
    * the raw text content from the buffer. If the extracted text content is
    * empty, it returns an empty array. Otherwise, it creates a new
    * `Document` instance with the extracted text content and the provided
@@ -26,6 +40,31 @@ export class DocxLoader extends BufferLoader {
     raw: Buffer,
     metadata: Document["metadata"]
   ): Promise<Document[]> {
+    if (this.options.type === "doc") {
+      return this.parseDoc(raw, metadata);
+    }
+    return this.parseDocx(raw, metadata);
+  }
+
+  /**
+   * A private method that takes a `raw` buffer and `metadata` as parameters and
+   * returns a promise that resolves to an array of `Document` instances. It
+   * uses the `extractRawText` function from the `mammoth` module to extract
+   * the raw text content from the buffer. If the extracted text content is
+   * empty, it returns an empty array. Otherwise, it creates a new
+   * `Document` instance with the extracted text content and the provided
+   * metadata, and returns it as an array.
+   * @param raw The raw buffer from which to extract text content.
+   * @param metadata The metadata to be associated with the created `Document` instance.
+   * @returns A promise that resolves to an array of `Document` instances.
+   */
+  private async parseDocx(
+    raw: Buffer,
+    metadata: Document["metadata"]
+  ): Promise<Document[]> {
+    if (this.options.type === "doc") {
+      return this.parseDoc(raw, metadata);
+    }
     const { extractRawText } = await DocxLoaderImports();
     const docx = await extractRawText({
       buffer: raw,
@@ -40,6 +79,33 @@ export class DocxLoader extends BufferLoader {
       }),
     ];
   }
+
+  /**
+   * Extracts the text of a legacy `.doc` file. It uses the `extract` method
+   * from the `word-extractor` module to read the buffer and `getBody()` to
+   * obtain the text content. If the extracted text content is empty, it
+   * returns an empty array. Otherwise, it creates a new `Document` instance
+   * with the extracted text content and the provided metadata, and returns
+   * it as an array.
+   * @param raw The raw buffer from which to extract text content.
+   * @param metadata The metadata to be associated with the created `Document` instance.
+   * @returns A promise that resolves to an array of `Document` instances.
+   */
+  private async parseDoc(
+    raw: Buffer,
+    metadata: Document["metadata"]
+  ): Promise<Document[]> {
+    const WordExtractor = await DocLoaderImports();
+    const extractor = new WordExtractor();
+    const doc = await extractor.extract(raw);
+    const body = doc.getBody();
+
+    // Honor the documented contract: no extracted text means no documents,
+    // rather than a single `Document` with empty `pageContent`.
+    if (!body) {
+      return [];
+    }
+
+    return [
+      new Document({
+        pageContent: body,
+        metadata,
+      }),
+    ];
+  }
 }
 
 async function DocxLoaderImports() {
@@ -53,3 +119,15 @@ async function DocxLoaderImports() {
     );
   }
 }
+
+/**
+ * Dynamically imports the optional `word-extractor` package.
+ * @returns The default export of the `word-extractor` module.
+ * @throws Error with an installation hint when the dynamic import fails.
+ */
+async function DocLoaderImports() {
+  try {
+    const { default: extractorClass } = await import("word-extractor");
+    return extractorClass;
+  } catch (importError) {
+    // Log the underlying failure before replacing it with an actionable message.
+    console.error(importError);
+    throw new Error(
+      "Failed to load word-extractor. Please install it with eg. `npm install word-extractor`."
+    );
+  }
+}
diff --git a/libs/langchain-community/src/document_loaders/tests/docx.test.ts b/libs/langchain-community/src/document_loaders/tests/docx.test.ts
@@ -3,7 +3,7 @@ import * as url from "node:url";
 import * as path from "node:path";
 import { DocxLoader } from "../fs/docx.js";
 
-test("Test Word doc loader from file", async () => {
+test("Test Word doc loader from .docx file", async () => {
   const filePath = path.resolve(
     path.dirname(url.fileURLToPath(import.meta.url)),
     "./example_data/attention.docx"
@@ -14,3 +14,17 @@ test("Test Word doc loader from file", async () => {
   expect(docs.length).toBe(1); // not much text in the example
   expect(docs[0].pageContent).toContain("an interesting activity");
 });
+
+// Loads the legacy `.doc` fixture with `type: "doc"` and checks the extracted text.
+test("Test Word doc loader from .doc file", async () => {
+  // Resolve the fixture relative to this test file's own directory.
+  const filePath = path.resolve(
+    path.dirname(url.fileURLToPath(import.meta.url)),
+    "./example_data/attention.doc"
+  );
+  // `type: "doc"` is passed explicitly; without options the loader defaults to "docx".
+  const loader = new DocxLoader(filePath, {
+    type: "doc",
+  });
+  const docs = await loader.load();
+
+  expect(docs.length).toBe(1); // not much text in the example
+  expect(docs[0].pageContent).toContain("an interesting activity");
+});
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/attention.doc b/libs/langchain-community/src/document_loaders/tests/example_data/attention.doc
diff --git a/yarn.lock b/yarn.lock
@@ -11886,6 +11886,7 @@ __metadata:
     "@types/pg": ^8.11.0
     "@types/pg-copy-streams": ^1.2.2
     "@types/uuid": ^9
+    "@types/word-extractor": ^1
     "@types/ws": ^8
     "@typescript-eslint/eslint-plugin": ^5.58.0
     "@typescript-eslint/parser": ^5.58.0
@@ -11985,6 +11986,7 @@ __metadata:
     voy-search: 0.6.2
     weaviate-ts-client: ^1.4.0
     web-auth-library: ^1.0.3
+    word-extractor: ^1.0.4
     youtube-transcript: ^1.0.6
     youtubei.js: ^9.1.0
     zod: ^3.22.3
@@ -12113,6 +12115,7 @@ __metadata:
     voy-search: 0.6.2
     weaviate-ts-client: "*"
     web-auth-library: ^1.0.3
+    word-extractor: "*"
     ws: ^8.14.2
     youtube-transcript: ^1.0.6
     youtubei.js: ^9.1.0
@@ -12353,6 +12356,8 @@ __metadata:
       optional: true
     web-auth-library:
       optional: true
+    word-extractor:
+      optional: true
     ws:
       optional: true
     youtube-transcript:
@@ -20168,6 +20173,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/word-extractor@npm:^1":
+  version: 1.0.6
+  resolution: "@types/word-extractor@npm:1.0.6"
+  dependencies:
+    "@types/node": "*"
+  checksum: 3619602252493e1ad2671af6ce73a342cdc8452d5a3473123474ab6f3f5deb466b9c4cfdcbac75eacb430f545ae5708e3165b01e03b42053ca9fa87d2920fe3d
+  languageName: node
+  linkType: hard
+
 "@types/ws@npm:8.5.9":
   version: 8.5.9
   resolution: "@types/ws@npm:8.5.9"
@@ -39965,6 +39979,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"saxes@npm:^5.0.1":
+  version: 5.0.1
+  resolution: "saxes@npm:5.0.1"
+  dependencies:
+    xmlchars: ^2.2.0
+  checksum: 5636b55cf15f7cf0baa73f2797bf992bdcf75d1b39d82c0aa4608555c774368f6ac321cb641fd5f3d3ceb87805122cd47540da6a7b5960fe0dbdb8f8c263f000
+  languageName: node
+  linkType: hard
+
 "saxes@npm:^6.0.0":
   version: 6.0.0
   resolution: "saxes@npm:6.0.0"
@@ -44345,6 +44368,16 @@ __metadata:
   languageName: node
   linkType: hard
 
+"word-extractor@npm:^1.0.4":
+  version: 1.0.4
+  resolution: "word-extractor@npm:1.0.4"
+  dependencies:
+    saxes: ^5.0.1
+    yauzl: ^2.10.0
+  checksum: 04ed0ef1dfd6b26ab2094671e72f16e5a948f9978da3fd6b9d01ff475ecd048199f529d989f1d0dfe3da684a1aa8bb86e4388edabd706fd74a0be4eb030183cd
+  languageName: node
+  linkType: hard
+
 "word-wrap@npm:^1.2.3, word-wrap@npm:~1.2.3":
   version: 1.2.3
   resolution: "word-wrap@npm:1.2.3"