ソースを参照

feat(whisper): use whisper-cpp and ollama to do local speech to text magic

Joe 9 ヶ月 前
コミット
65db82b2c9
2 ファイル変更、176 行追加、0 行削除
  1. 1 0
      .scripts/omnibar.zsh
  2. 175 0
      .scripts/whisper.zsh

+ 1 - 0
.scripts/omnibar.zsh

@@ -23,6 +23,7 @@ tmux display-menu -t . \
     "Zoom Pane" "z" "resize-pane -Z" \
     ""  \
     "Screenshot" "+" "run-shell \"/bin/zsh -l $DOTFILES_DIR/.scripts/tmux_screencap.zsh\"" \
+    "Whisper" "w" "popup -E -w 82 -h 40 whisper.zsh stream" \
     ""  \
     "New Window" "c" "new-window /bin/zsh $DOTFILES_DIR/.scripts/nvim_launch.zsh" \
     "Close Pane" "x" "kill-pane" \

+ 175 - 0
.scripts/whisper.zsh

@@ -0,0 +1,175 @@
+#!/bin/zsh
+
+# Location of the local whisper.cpp checkout used by every mode of this script.
+export WHISPER_PATH="${HOME}/.whisper"
+# Models handed to build_model one by one. The names must match upstream model
+# names exactly; the turbo variant is spelled "large-v3-turbo" (hyphen, not dot).
+export WHISPER_MODELS=( 'medium.en' 'tiny.en' 'small.en' 'large-v3-turbo' )
+# The first entry of the list is the model used for streaming.
+export WHISPER_MODEL=${WHISPER_MODELS[@]:0:1}
+
+# Make the cursor visible again; installed below as the EXIT handler.
+function cleanup {
+    tput cnorm
+}
+
+# Run cleanup whenever the script exits.
+trap 'cleanup' EXIT
+
+# Hide the cursor for the lifetime of the script.
+tput civis
+
+function build_model {
+    cd $WHISPER_PATH
+    make -j "$1"
+}
+
+function build_whisper {
+    cd $WHISPER_PATH
+    for model in $WHISPER_MODELS; do
+        build_model $model
+    done
+    cmake -B build -DWHISPER_SDL2=ON
+    cmake --build build --config Release
+}
+
+# First-run bootstrap: without a checkout at $WHISPER_PATH only `build` can
+# proceed; every other mode is told how to set things up.
+if [[ ! -d "$WHISPER_PATH" ]]; then
+    if [[ "$1" == "build" ]]; then
+        # Stop if the clone fails; building without a checkout cannot work.
+        git clone git@github.com:ggerganov/whisper.cpp.git --depth 1 "$WHISPER_PATH" || exit 1
+        build_whisper
+        # Report the real build result instead of unconditional success.
+        exit $?
+    else
+        echo "$WHISPER_PATH does not exist; run \`whisper.zsh build\` to set up." >&2
+        exit 1
+    fi
+fi
+
+# `update`: pull the latest whisper.cpp sources and rebuild from scratch.
+if [[ "$1" == "update" ]]; then
+    # The rm -rf below must only ever run inside the checkout.
+    cd "$WHISPER_PATH" || exit 1
+    # Keep the existing build if the sources could not be refreshed.
+    git pull || exit 1
+    rm -rf build
+    build_whisper
+    # Propagate the build result to the caller.
+    exit $?
+fi
+
+if [[ "$1" == "stream" ]]; then
+    retry=true
+    while $retry; do;
+        # clear
+        whisper_content_write_tmp_file="$(mktemp)"
+        whisper_content_read_tmp_file="$(mktemp)"
+        ollama_prompt_file="$(mktemp)"
+        whisper_pid_tmp_file="$(mktemp)"
+        $WHISPER_PATH/build/bin/whisper-stream --keep-context --flash-attn --length 10000 --step 2500 --keep 1000 --threads 4 --model "$WHISPER_PATH/models/ggml-${WHISPER_MODEL}.bin" 2>/dev/null > "$whisper_content_write_tmp_file" & echo $! >> "$whisper_pid_tmp_file"
+        whisper_pid=$(cat "$whisper_pid_tmp_file")
+        rm "$whisper_pid_tmp_file"
+
+        oldselection=""
+        selection=""
+        killed=false
+        while true; do;
+            cp "$whisper_content_write_tmp_file" "$whisper_content_read_tmp_file"
+            gsed -i 's/^.*\[2K\r//g;s/^ //g;s/^\[Start speaking\]$//g;/^[ ,.\-\_\*]*$/d;' "$whisper_content_read_tmp_file"
+            clear
+            if $killed; then;
+                printf "Status: \e[31mStopped\e[0m"
+            else
+                printf "Status: \e[32mRunning $()\e[0m"
+            fi;
+            printf "\nAction:   a) Save        "
+            if [[ "$selection" == "r" ]]; then
+                printf "\e[31mr)\e[0m"
+            else
+                printf "r)"
+            fi
+            printf " Copy        s) Retry       t) Quit\nRephrase: "
+            if [[ "$selection" == "n" ]]; then
+                printf "\e[31mn)\e[0m"
+            else
+                printf "n)"
+            fi
+            printf " Normal      "
+            if [[ "$selection" == "e" ]]; then
+                printf "\e[31me)\e[0m"
+            else
+                printf "e)"
+            fi
+            printf " Prompt      "
+            if [[ "$selection" == "i" ]]; then
+                printf "\e[31mi)\e[0m"
+            else
+                printf "i)"
+            fi
+            printf " Programmer  "
+            if [[ "$selection" == "o" ]]; then
+                printf "\e[31mo)\e[0m"
+            else
+                printf "o)"
+            fi
+            printf " Prose\n\n"
+
+            echo "$(fold -w 80 -s $whisper_content_read_tmp_file)"
+
+            read -k1 -s -t 0.1 selection
+            case $selection in
+                "")
+                    ;;
+                *)
+                    kill -9 $whisper_pid &>/dev/null
+                    killed=true
+                    if [[ "$oldselection" == "$selection" ]]; then
+                        continue
+                    fi;
+                    oldselection="$selection"
+                    case $selection in
+                        a)
+                            saved="$(mktemp)"
+                            cp "$whisper_content_read_tmp_file" "$saved";
+                            echo "$saved";
+                            retry=false;
+                            break;
+                            ;;
+                        r)
+                            cat "$whisper_content_read_tmp_file" | pbcopy;
+                            ;;
+                        s)
+                            break;
+                            ;;
+                        t)
+                            retry=false;
+                            break;
+                            ;;
+                        n|e|i|o)
+                            echo "
+                                You are an expert in copy editing and audio transcription. Your assignment is to expertly improve the quality of audio transcriptions.
+                                The rules of the task are as follows:
+                                - Keep the transcription largely unaltered. You may remove or replace text when the speaker has corrected or repeated themselves.
+                                - Respond only with the cleaned-up transcription. Your own notes, labels, and descriptions are strictly NOT allowed! Headings and labels are NEVER allowed!
+                                - Small changes to punctuation and grammar are permitted, so long as they stay true to the original tone of the transcription.
+                                - Do NOT confirm the request, instead go straight to the cleaned-up transcription!
+                                - Prioritize delivering the transcription. Avoid non-essential comments or filler phrases such as 'Here's the...', or labels such as 'Transcription:'.
+                                - The speaker may give directions such as 'scratch that' or 'actually, I mean...', indicating that previous text should be replaced. Pay attention to these directions and edit the dictation accordingly.
+                                - Use Markdown for your output.
+                                The audio transcription you will receive is from a single speaker dictating into his laptop microphone.
+                            " > "$ollama_prompt_file"
+                            case $selection in
+                                e)
+                                    echo "The speaker is dictating a text prompt to be used for a Large Language model. Therefore, this transcription must be formatted with the aim of being an excellent LLM prompt." >> "$ollama_prompt_file"
+                                    ;;
+                                i)
+                                    echo "The speaker is a programmer who is dictating a text prompt to be used for a Large Language model. Therefore, this transcription must be formatted with the aim of being an excellent LLM prompt. The context of the dictation is related to programming, so be sure to format parts of the text that appear to be code accordingly." >> "$ollama_prompt_file"
+                                    ;;
+                                o)
+                                    echo "The speaker is a professional who is writing prose. Take this into account and ensure that the result is professionally formatted with correct grammar and tone." >> "$ollama_prompt_file"
+                                    ;;
+                            esac;
+                            echo "
+                                The transcription to be clean up is as follows:\n\n
+                            " >> "$ollama_prompt_file"
+                            gsed -i 's/^ +//g;/^ *$/d' "$whisper_content_read_tmp_file"
+                            cat "$whisper_content_read_tmp_file" >> "$ollama_prompt_file"
+                            ollama run gemma3:4b --keepalive=30s "$(cat $ollama_prompt_file)" 2>/dev/null > $whisper_content_write_tmp_file &
+                            ;;
+                    esac;
+                ;;
+            esac;
+        done;
+        rm "$whisper_content_write_tmp_file"
+        rm "$whisper_content_read_tmp_file"
+        rm "$ollama_prompt_file"
+    done;
+    exit 0
+fi;