From 0f29cefd5bcdc3dcfdea5b9d0133ccbc1d0d5023 Mon Sep 17 00:00:00 2001 From: Nikola Zivkovic Date: Fri, 9 Sep 2022 16:02:34 +0200 Subject: [PATCH] Classes for custom vocabulary First token fix --- README.md | 5 ++++- src/BERTTokenizers.csproj | 2 +- src/Base/TokenizerBase.cs | 6 ++++-- src/BertCasedCustomVocabulary.cs | 11 +++++++++++ src/BertUncasedCustomVocabulary.cs | 10 ++++++++++ 5 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 src/BertCasedCustomVocabulary.cs create mode 100644 src/BertUncasedCustomVocabulary.cs diff --git a/README.md b/README.md index 9fbeb06..2e556df 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ While working with BERT Models from Huggingface in combination with ML.NET, I st I documented them in [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
However, the biggest challenge by far was that I needed to implement my own tokenizer and pair them with the correct vocabulary. So, I decided to extend it and publish my implementation as a NuGet package and an open-source project. -More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/) +More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
This repository contains tokenizers for following models:
· BERT Base
@@ -77,6 +77,8 @@ This repository contains tokenizers for following models:
· BERT Base Uncased
· BERT Large Uncased
+There are also classes that you can use to load your own vocabulary. +

(back to top)

### Built With @@ -194,6 +196,7 @@ n.zivkovic@rubikscode.net
## Acknowledgments * Gianluca Bertani - Performance Improvements +* [Paul Calot](https://github.com/PaulCalot) - First Token bugfix

(back to top)

diff --git a/src/BERTTokenizers.csproj b/src/BERTTokenizers.csproj index cf976ac..e0ab91d 100644 --- a/src/BERTTokenizers.csproj +++ b/src/BERTTokenizers.csproj @@ -21,7 +21,7 @@ · BERT Large Uncased Open-source project for BERT tokenizers that can be used in C#. BERT, Tokenizer, charp, dotnet - 1.1.0 + 1.2.0 diff --git a/src/Base/TokenizerBase.cs b/src/Base/TokenizerBase.cs index a9cd733..5a21d8f 100644 --- a/src/Base/TokenizerBase.cs +++ b/src/Base/TokenizerBase.cs @@ -2,6 +2,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; namespace BERTTokenizers.Base { @@ -118,7 +119,7 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens) { string prefix = null; int subwordLength = remaining.Length; - while (subwordLength >= 2) + while (subwordLength >= 1) // was initially 2, which prevents using "character encoding" { string subword = remaining.Substring(0, subwordLength); if (!_vocabularyDict.ContainsKey(subword)) { @@ -138,7 +139,8 @@ private IEnumerable SegmentIndex(List<(string token, int index)> tokens) return tokens; } - remaining = remaining.Replace(prefix, "##"); + var regex = new Regex("^" + Regex.Escape(prefix)); + remaining = regex.Replace(remaining, "##", 1); tokens.Add((prefix, _vocabularyDict[prefix])); } diff --git a/src/BertCasedCustomVocabulary.cs b/src/BertCasedCustomVocabulary.cs new file mode 100644 index 0000000..308429a --- /dev/null +++ b/src/BertCasedCustomVocabulary.cs @@ -0,0 +1,11 @@ +using BERTTokenizers.Base; + +namespace BERTTokenizers +{ + public class BertCasedCustomVocabulary : CasedTokenizer + { + public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { } + + } + +} diff --git a/src/BertUncasedCustomVocabulary.cs b/src/BertUncasedCustomVocabulary.cs new file mode 100644 index 0000000..4ed1014 --- /dev/null +++ b/src/BertUncasedCustomVocabulary.cs @@ -0,0 +1,10 @@ +using BERTTokenizers.Base; + +namespace BERTTokenizers +{ + public class 
BertUncasedCustomVocabulary : UncasedTokenizer + { + public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { } + + } +}