Skip to content
This repository has been archived by the owner on Sep 17, 2024. It is now read-only.

Commit

Permalink
add language to words
Browse files Browse the repository at this point in the history
`_collate_word_timestamps` now takes a `return_language` flag that determines whether the chunk's language is added to each word's entry.
  • Loading branch information
robinderat committed Jun 24, 2024
1 parent 0dd65a0 commit 391cd45
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions src/transformers/models/whisper/tokenization_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,7 @@ def new_chunk():
chunk["text"] = resolved_text
if return_timestamps == "word":
chunk["words"] = _collate_word_timestamps(
tokenizer, resolved_tokens, resolved_token_timestamps, last_language
tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
)
chunks.append(chunk)

Expand Down Expand Up @@ -1065,7 +1065,7 @@ def new_chunk():
chunk["text"] = resolved_text
if return_timestamps == "word":
chunk["words"] = _collate_word_timestamps(
tokenizer, resolved_tokens, resolved_token_timestamps, last_language
tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
)
chunks.append(chunk)

Expand Down Expand Up @@ -1197,12 +1197,16 @@ def _find_longest_common_sequence(sequences, token_timestamp_sequences=None):
return total_sequence, []


def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language):
def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language, return_language):
words, _, token_indices = _combine_tokens_into_words(tokenizer, tokens, language)

optional_language_field = {"language": language} if return_language else {}

timings = [
{
"text": word,
"timestamp": (token_timestamps[indices[0]][0], token_timestamps[indices[-1]][1]),
**optional_language_field
}
for word, indices in zip(words, token_indices)
]
Expand Down

0 comments on commit 391cd45

Please sign in to comment.