From 0f29cefd5bcdc3dcfdea5b9d0133ccbc1d0d5023 Mon Sep 17 00:00:00 2001 From: Nikola Zivkovic Date: Fri, 9 Sep 2022 16:02:34 +0200 Subject: [PATCH] Classes for custom vocabulary First token fix --- README.md | 5 ++++- src/BERTTokenizers.csproj | 2 +- src/Base/TokenizerBase.cs | 6 ++++-- src/BertCasedCustomVocabulary.cs | 11 +++++++++++ src/BertUncasedCustomVocabulary.cs | 10 ++++++++++ 5 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 src/BertCasedCustomVocabulary.cs create mode 100644 src/BertUncasedCustomVocabulary.cs diff --git a/README.md b/README.md index 9fbeb06..2e556df 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ While working with BERT Models from Huggingface in combination with ML.NET, I st I documented them in [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
However, the biggest challenge by far was that I needed to implement my own tokenizer and pair them with the correct vocabulary. So, I decided to extend it and publish my implementation as a NuGet package and an open-source project. -More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/) +More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
This repository contains tokenizers for following models:
· BERT Base
@@ -77,6 +77,8 @@ This repository contains tokenizers for following models:
· BERT Base Uncased
· BERT Large Uncased
+There are also classes that you can use to load your own vocabulary. +

(back to top)

### Built With @@ -194,6 +196,7 @@ n.zivkovic@rubikscode.net
## Acknowledgments * Gianluca Bertani - Performance Improvements +* [Paul Calot](https://github.com/PaulCalot) - First Token bugfix

(back to top)

diff --git a/src/BERTTokenizers.csproj b/src/BERTTokenizers.csproj index cf976ac..e0ab91d 100644 --- a/src/BERTTokenizers.csproj +++ b/src/BERTTokenizers.csproj @@ -21,7 +21,7 @@ · BERT Large Uncased Open-source project for BERT tokenizers that can be used in C#. BERT, Tokenizer, charp, dotnet - 1.1.0 + 1.2.0 diff --git a/src/Base/TokenizerBase.cs b/src/Base/TokenizerBase.cs index a9cd733..5a21d8f 100644 --- a/src/Base/TokenizerBase.cs +++ b/src/Base/TokenizerBase.cs @@ -2,6 +2,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; namespace BERTTokenizers.Base { @@ -118,7 +119,7 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens) { string prefix = null; int subwordLength = remaining.Length; - while (subwordLength >= 2) + while (subwordLength >= 1) // was initially 2, which prevents using "character encoding" { string subword = remaining.Substring(0, subwordLength); if (!_vocabularyDict.ContainsKey(subword)) { @@ -138,7 +139,8 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens) return tokens; } - remaining = remaining.Replace(prefix, "##"); + var regex = new Regex("^" + Regex.Escape(prefix)); + remaining = regex.Replace(remaining, "##", 1); tokens.Add((prefix, _vocabularyDict[prefix])); } diff --git a/src/BertCasedCustomVocabulary.cs b/src/BertCasedCustomVocabulary.cs new file mode 100644 index 0000000..308429a --- /dev/null +++ b/src/BertCasedCustomVocabulary.cs @@ -0,0 +1,11 @@ +using BERTTokenizers.Base; + +namespace BERTTokenizers +{ + public class BertCasedCustomVocabulary : CasedTokenizer + { + public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { } + + } + +} diff --git a/src/BertUncasedCustomVocabulary.cs b/src/BertUncasedCustomVocabulary.cs new file mode 100644 index 0000000..4ed1014 --- /dev/null +++ b/src/BertUncasedCustomVocabulary.cs @@ -0,0 +1,10 @@ +using BERTTokenizers.Base; + +namespace BERTTokenizers +{ + public class 
BertUncasedCustomVocabulary : UncasedTokenizer + { + public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { } + + } +}