From cbbe633082fb8159749bfa5fa8289bc65f1dfb67 Mon Sep 17 00:00:00 2001
From: Guillaume Klein
Date: Tue, 14 Feb 2023 09:34:05 +0100
Subject: [PATCH] Add num_workers parameter

---
 README.md                    | 2 ++
 faster_whisper/transcribe.py | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 616b903..0517b1d 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,8 @@ for segment in segments:
     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
 ```
 
+See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
+
 ## Comparing performance against other implementations
 
 If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:

diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 81a9736..4ec28d4 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -49,6 +49,7 @@ class WhisperModel:
         device_index=0,
         compute_type="default",
         cpu_threads=0,
+        num_workers=1,
     ):
         """Initializes the Whisper model.
 
@@ -56,10 +57,17 @@ class WhisperModel:
           model_path: Path to the converted model.
           device: Device to use for computation ("cpu", "cuda", "auto").
           device_index: Device ID to use.
+            The model can also be loaded on multiple GPUs by passing a list of IDs
+            (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
+            when transcribe() is called from multiple Python threads (see also num_workers).
           compute_type: Type to use for computation.
             See https://opennmt.net/CTranslate2/quantization.html.
           cpu_threads: Number of threads to use when running on CPU (4 by default).
             A non zero value overrides the OMP_NUM_THREADS environment variable.
+          num_workers: When transcribe() is called from multiple Python threads,
+            having multiple workers enables true parallelism when running the model
+            (concurrent calls to self.model.generate() will run in parallel).
+            This can improve the global throughput at the cost of increased memory usage.
         """
         self.model = ctranslate2.models.Whisper(
             model_path,
@@ -67,6 +75,7 @@ class WhisperModel:
             device_index=device_index,
             compute_type=compute_type,
             intra_threads=cpu_threads,
+            inter_threads=num_workers,
         )
 
         with open(os.path.join(model_path, "vocabulary.txt")) as vocab_file:
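
Below is a minimal usage sketch (not part of the patch) illustrating how the new `num_workers` parameter is meant to be used: `transcribe()` is called from multiple Python threads so that concurrent model runs can proceed in parallel. It assumes `WhisperModel` is importable from the `faster_whisper` package and that `transcribe()` returns an iterable of segments as in the README snippet above; the model path and audio file names are placeholders.

```python
# Sketch only: parallel transcription from multiple Python threads using num_workers.
# Assumes WhisperModel is exposed by the faster_whisper package and that
# transcribe() yields segments as in the README example; all paths are placeholders.
from concurrent.futures import ThreadPoolExecutor

from faster_whisper import WhisperModel

# Two workers allow two concurrent transcribe() calls to run the model in parallel.
model = WhisperModel("path/to/converted-model", device="cuda", num_workers=2)

def transcribe_file(path):
    segments = model.transcribe(path)
    # Consume the segments so the transcription actually runs in this thread.
    return [(segment.start, segment.end, segment.text) for segment in segments]

audio_files = ["audio1.mp3", "audio2.mp3"]  # placeholder inputs
with ThreadPoolExecutor(max_workers=2) as executor:
    for path, result in zip(audio_files, executor.map(transcribe_file, audio_files)):
        print(path, "->", len(result), "segments")
```

As the added docstring notes, extra workers trade higher memory usage for better overall throughput; a single thread calling `transcribe()` gains nothing from `num_workers > 1`.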