diff options
| author | Paul Buetow <paul@buetow.org> | 2025-07-15 23:28:13 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-07-15 23:28:13 +0300 |
| commit | 61529facc2c5321de9f0ab9123cb1de25bcab62c (patch) | |
| tree | 0768d5d5e68c71ea52fc31ca2d33950c93977314 /cmd | |
| parent | 9c77f2a7bef485fa137f123cbf55b42cacb2b285 (diff) | |
feat: remove espeak, add random voice/style selection, fix punctuation in TTS
- Removed espeak audio provider completely, now only uses OpenAI TTS
- Audio now uses random voice selection by default (can override with --openai-voice)
- Added --all-voices flag to generate audio in all 11 OpenAI voices
- Images now use random art styles (13 different styles including superhero, yoga, cat-themed)
- Fixed TTS to remove punctuation marks before speaking
- Updated Bulgarian pronunciation instructions to explicitly avoid Russian accent
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
Diffstat (limited to 'cmd')
| -rw-r--r-- | cmd/totalrecall/main.go | 109 |
1 file changed, 54 insertions, 55 deletions
diff --git a/cmd/totalrecall/main.go b/cmd/totalrecall/main.go index df9e666..2a43d51 100644 --- a/cmd/totalrecall/main.go +++ b/cmd/totalrecall/main.go @@ -3,10 +3,12 @@ package main import ( "context" "fmt" + "math/rand" "os" "path/filepath" "sort" "strings" + "time" "github.com/sashabaranov/go-openai" "github.com/spf13/cobra" @@ -21,7 +23,7 @@ import ( var ( // Flags cfgFile string - voice string + // voice removed - was only for espeak outputDir string audioFormat string imageAPI string @@ -31,12 +33,8 @@ var ( imagesPerWord int generateAnki bool listModels bool - // Audio provider flags - audioProvider string - // Audio tuning flags (espeak) - audioPitch int - audioAmplitude int - audioWordGap int + allVoices bool + // Audio provider flags removed - now only OpenAI // OpenAI flags openAIModel string openAIVoice string @@ -55,7 +53,7 @@ var rootCmd = &cobra.Command{ Short: "Bulgarian Anki Flashcard Generator", Long: `totalrecall generates Anki flashcard materials from Bulgarian words. -It creates audio pronunciation files using espeak-ng and downloads +It creates audio pronunciation files using OpenAI TTS and downloads representative images from web search APIs. 
Example: @@ -69,11 +67,13 @@ Example: func init() { cobra.OnInitialize(initConfig) + // Initialize random number generator + rand.Seed(time.Now().UnixNano()) + // Global flags rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.totalrecall.yaml)") // Local flags - rootCmd.Flags().StringVarP(&voice, "voice", "v", "bg+f1", "Voice variant (bg, bg+m1, bg+f1, etc.)") rootCmd.Flags().StringVarP(&outputDir, "output", "o", "./anki_cards", "Output directory") rootCmd.Flags().StringVarP(&audioFormat, "format", "f", "mp3", "Audio format (wav or mp3)") rootCmd.Flags().StringVar(&imageAPI, "image-api", "openai", "Image source (pixabay, unsplash, or openai)") @@ -83,18 +83,13 @@ func init() { rootCmd.Flags().IntVar(&imagesPerWord, "images-per-word", 1, "Number of images to download per word") rootCmd.Flags().BoolVar(&generateAnki, "anki", false, "Generate Anki import CSV file") rootCmd.Flags().BoolVar(&listModels, "list-models", false, "List available OpenAI models for the current API key") + rootCmd.Flags().BoolVar(&allVoices, "all-voices", false, "Generate audio in all available voices (creates multiple files)") - // Audio provider selection - rootCmd.Flags().StringVar(&audioProvider, "audio-provider", "openai", "Audio provider: espeak or openai") - - // Audio tuning flags (espeak) - rootCmd.Flags().IntVar(&audioPitch, "pitch", 50, "Audio pitch adjustment (0-99, default 50, espeak only)") - rootCmd.Flags().IntVar(&audioAmplitude, "amplitude", 100, "Audio volume (0-200, default 100, espeak only)") - rootCmd.Flags().IntVar(&audioWordGap, "word-gap", 0, "Gap between words in 10ms units (default 0, espeak only)") + // Audio provider removed - now only OpenAI // OpenAI flags rootCmd.Flags().StringVar(&openAIModel, "openai-model", "gpt-4o-mini-tts", "OpenAI TTS model: tts-1, tts-1-hd, gpt-4o-mini-tts") - rootCmd.Flags().StringVar(&openAIVoice, "openai-voice", "nova", "OpenAI voice: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, 
shimmer, verse") + rootCmd.Flags().StringVar(&openAIVoice, "openai-voice", "", "OpenAI voice: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse (default: random)") rootCmd.Flags().Float64Var(&openAISpeed, "openai-speed", 0.8, "OpenAI speech speed (0.25 to 4.0, may be ignored by gpt-4o-mini-tts)") rootCmd.Flags().StringVar(&openAIInstruction, "openai-instruction", "", "Voice instructions for gpt-4o-mini-tts model (e.g., 'speak slowly with a Bulgarian accent')") @@ -237,23 +232,47 @@ func processWord(word string) error { } func generateAudio(word string) error { + allVoicesList := []string{"alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "verse"} + + // Get list of voices to use + var voices []string + if allVoices { + voices = allVoicesList + } else if openAIVoice != "" { + // Use explicitly specified voice + voices = []string{openAIVoice} + fmt.Printf(" Using specified voice: %s\n", openAIVoice) + } else { + // Select a random voice + randomVoice := allVoicesList[rand.Intn(len(allVoicesList))] + voices = []string{randomVoice} + fmt.Printf(" Using random voice: %s\n", randomVoice) + } + + // Generate audio for each voice + for i, voice := range voices { + if allVoices { + fmt.Printf(" Generating audio %d/%d (voice: %s)...\n", i+1, len(voices), voice) + } + if err := generateAudioWithVoice(word, voice); err != nil { + return fmt.Errorf("failed to generate audio with voice %s: %w", voice, err) + } + } + + return nil +} + +func generateAudioWithVoice(word, voice string) error { // Create audio provider configuration providerConfig := &audio.Config{ - Provider: audioProvider, + Provider: "openai", OutputDir: outputDir, OutputFormat: audioFormat, - // ESpeak settings - ESpeakVoice: voice, - ESpeakSpeed: viper.GetInt("audio.speed"), - ESpeakPitch: audioPitch, - ESpeakAmplitude: audioAmplitude, - ESpeakWordGap: audioWordGap, - // OpenAI settings OpenAIKey: getOpenAIKey(), OpenAIModel: openAIModel, - OpenAIVoice: 
openAIVoice, + OpenAIVoice: voice, OpenAISpeed: openAISpeed, OpenAIInstruction: openAIInstruction, @@ -263,26 +282,11 @@ func generateAudio(word string) error { } // Set defaults - if providerConfig.ESpeakSpeed == 0 { - providerConfig.ESpeakSpeed = 150 - } if providerConfig.CacheDir == "" { providerConfig.CacheDir = "./.audio_cache" } // Use config file values if not overridden by flags - if audioProvider == "openai" && viper.IsSet("audio.provider") { - providerConfig.Provider = viper.GetString("audio.provider") - } - if audioPitch == 50 && viper.IsSet("audio.pitch") { - providerConfig.ESpeakPitch = viper.GetInt("audio.pitch") - } - if audioAmplitude == 100 && viper.IsSet("audio.amplitude") { - providerConfig.ESpeakAmplitude = viper.GetInt("audio.amplitude") - } - if audioWordGap == 0 && viper.IsSet("audio.word_gap") { - providerConfig.ESpeakWordGap = viper.GetInt("audio.word_gap") - } if openAIModel == "gpt-4o-mini-tts" && viper.IsSet("audio.openai_model") { providerConfig.OpenAIModel = viper.GetString("audio.openai_model") } @@ -299,25 +303,20 @@ func generateAudio(word string) error { // Create the audio provider provider, err := audio.NewProvider(providerConfig) if err != nil { - // If OpenAI fails, try to create a fallback to espeak - if providerConfig.Provider == "openai" { - fmt.Printf("Warning: OpenAI audio provider failed (%v), falling back to espeak-ng\n", err) - providerConfig.Provider = "espeak" - fallbackProvider, fallbackErr := audio.NewProvider(providerConfig) - if fallbackErr != nil { - return fmt.Errorf("both OpenAI and espeak-ng failed: %v", fallbackErr) - } - provider = fallbackProvider - } else { - return err - } + return err } // Generate audio file + ctx := context.Background() filename := sanitizeFilename(word) - outputFile := filepath.Join(outputDir, fmt.Sprintf("%s.%s", filename, audioFormat)) - ctx := context.Background() + // Add voice name to filename if generating multiple voices + if allVoices { + outputFile := filepath.Join(outputDir, 
fmt.Sprintf("%s_%s.%s", filename, voice, audioFormat)) + return provider.GenerateAudio(ctx, word, outputFile) + } + + outputFile := filepath.Join(outputDir, fmt.Sprintf("%s.%s", filename, audioFormat)) return provider.GenerateAudio(ctx, word, outputFile) } |
