diff --git a/README.md b/README.md
index 9fbeb06..2e556df 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ While working with BERT Models from Huggingface in combination with ML.NET, I st
I documented them [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
However, the biggest challenge by far was that I needed to implement my own tokenizer and pair it with the correct vocabulary.
So, I decided to extend it and publish my implementation as a NuGet package and an open-source project.
-More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/)
+More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
This repository contains tokenizers for the following models:
* BERT Base
@@ -77,6 +77,8 @@ This repository contains tokenizers for following models:
* BERT Base Uncased
* BERT Large Uncased
+There are also classes that let you load your own vocabulary; see the example below.
+
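+For example, a minimal sketch (the vocabulary path is a placeholder, and `Tokenize` is the method inherited from the tokenizer base class):
+
+```csharp
+// Load a cased tokenizer from your own vocabulary file (placeholder path).
+var tokenizer = new BertCasedCustomVocabulary("path/to/vocab.txt");
+
+// Tokenize input text against that vocabulary.
+var tokens = tokenizer.Tokenize("Text to tokenize with a custom vocabulary.");
+```
+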
(back to top)
### Built With
@@ -194,6 +196,7 @@ n.zivkovic@rubikscode.net
## Acknowledgments
* Gianluca Bertani - Performance Improvements
+* [Paul Calot](https://github.com/PaulCalot) - First Token bugfix
(back to top)
diff --git a/src/BERTTokenizers.csproj b/src/BERTTokenizers.csproj
index cf976ac..e0ab91d 100644
--- a/src/BERTTokenizers.csproj
+++ b/src/BERTTokenizers.csproj
@@ -21,7 +21,7 @@
* BERT Large Uncased
Open-source project for BERT tokenizers that can be used in C#.
BERT, Tokenizer, csharp, dotnet
- 1.1.0
+ 1.2.0
diff --git a/src/Base/TokenizerBase.cs b/src/Base/TokenizerBase.cs
index a9cd733..5a21d8f 100644
--- a/src/Base/TokenizerBase.cs
+++ b/src/Base/TokenizerBase.cs
@@ -2,6 +2,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
+using System.Text.RegularExpressions;
namespace BERTTokenizers.Base
{
@@ -118,7 +119,7 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens)
{
string prefix = null;
int subwordLength = remaining.Length;
- while (subwordLength >= 2)
+ while (subwordLength >= 1) // was initially 2, which prevented falling back to single-character ("character-level") encoding
{
string subword = remaining.Substring(0, subwordLength);
if (!_vocabularyDict.ContainsKey(subword))
@@ -138,7 +139,8 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens)
return tokens;
}
- remaining = remaining.Replace(prefix, "##");
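+ // Replace only the first occurrence of the prefix with "##"; the previous
+ // string.Replace call rewrote every occurrence and corrupted the remainder.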
+ var regex = new Regex(Regex.Escape(prefix)); // escape the prefix so vocabulary tokens containing regex metacharacters (e.g. "(") match literally
+ remaining = regex.Replace(remaining, "##", 1);
tokens.Add((prefix, _vocabularyDict[prefix]));
}
diff --git a/src/BertCasedCustomVocabulary.cs b/src/BertCasedCustomVocabulary.cs
new file mode 100644
index 0000000..308429a
--- /dev/null
+++ b/src/BertCasedCustomVocabulary.cs
@@ -0,0 +1,11 @@
+using BERTTokenizers.Base;
+
+namespace BERTTokenizers
+{
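+ /// <summary>Cased BERT tokenizer that loads a user-supplied vocabulary file.</summary>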
+ public class BertCasedCustomVocabulary : CasedTokenizer
+ {
+ public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
+
+ }
+
+}
diff --git a/src/BertUncasedCustomVocabulary.cs b/src/BertUncasedCustomVocabulary.cs
new file mode 100644
index 0000000..4ed1014
--- /dev/null
+++ b/src/BertUncasedCustomVocabulary.cs
@@ -0,0 +1,10 @@
+using BERTTokenizers.Base;
+
+namespace BERTTokenizers
+{
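+ /// <summary>Uncased BERT tokenizer that loads a user-supplied vocabulary file.</summary>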
+ public class BertUncasedCustomVocabulary : UncasedTokenizer
+ {
+ public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
+
+ }
+}