diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | README.md | 91 | ||||
| -rw-r--r-- | TODO.md | 17 | ||||
| -rw-r--r-- | cmd/totalrecall/main.go | 504 | ||||
| -rw-r--r-- | internal/audio/provider.go | 2 | ||||
| -rw-r--r-- | internal/image/openai.go | 329 | ||||
| -rw-r--r-- | internal/image/openai_test.go | 280 | ||||
| -rw-r--r-- | internal/version.go | 2 |
8 files changed, 1207 insertions, 19 deletions
@@ -43,3 +43,4 @@ anki_cards # Configuration with API keys .totalrecall.yaml +.image_cache/ @@ -1,13 +1,18 @@ # totalrecall - Bulgarian Anki Flashcard Generator -`totalrecall` is a command-line tool that generates Anki flashcard materials from Bulgarian words. It creates audio pronunciation files using espeak-ng or OpenAI TTS and downloads representative images from web search APIs. +`totalrecall` is a command-line tool that generates Anki flashcard materials from Bulgarian words. It creates audio pronunciation files and generates images using AI. + +⚠️ **Important:** This tool uses OpenAI services by default, which requires an API key. See [Quick Start](#quick-start) for setup instructions or use the free alternatives with `--audio-provider espeak --image-api pixabay`. ## Features - Audio generation with multiple providers: - **espeak-ng**: Free, offline Bulgarian voices (robotic quality) - **OpenAI TTS**: High-quality, natural-sounding voices (requires API key) -- Image search via Pixabay and Unsplash APIs +- Image search and generation: + - **Pixabay**: Free stock photo search (optional API key) + - **Unsplash**: High-quality photo search (requires API key) + - **OpenAI DALL-E**: AI-generated educational images (requires API key) - Batch processing of multiple words - Anki-compatible CSV export - Configurable voice variants and speech speed @@ -59,17 +64,27 @@ go install codeberg.org/snonux/totalrecall/cmd/totalrecall@latest ## Quick Start -1. Generate materials for a single word: +**Note:** By default, totalrecall uses OpenAI for both audio and images. Make sure to set your OpenAI API key: +```bash +export OPENAI_API_KEY="sk-..." +``` + +1. Generate materials for a single word (uses OpenAI by default): ```bash totalrecall ябълка ``` -2. Process multiple words from a file: +2. Use free alternatives (espeak + pixabay): + ```bash + totalrecall ябълка --audio-provider espeak --image-api pixabay + ``` + +3. Process multiple words from a file: ```bash totalrecall --batch words.txt ``` -3. Generate with Anki CSV: +4. Generate with Anki CSV: ```bash totalrecall ябълка --anki ``` @@ -80,7 +95,7 @@ Create a `.totalrecall.yaml` file in your home directory or project folder: ```yaml audio: - provider: openai # Audio provider (espeak or openai) + provider: openai # Audio provider (espeak or openai) - default: openai format: mp3 # Audio format (wav or mp3) # ESpeak settings @@ -99,10 +114,19 @@ audio: cache_dir: "./.audio_cache" image: - provider: pixabay # Image provider (pixabay or unsplash) + provider: openai # Image provider (pixabay, unsplash, or openai) - default: openai pixabay_key: "" # Optional API key for higher limits unsplash_key: "" # Required for Unsplash - size: medium # Image size preference + + # OpenAI DALL-E settings + openai_model: "dall-e-2" # Model: dall-e-2 or dall-e-3 + openai_size: "512x512" # Size: 256x256, 512x512, 1024x1024 + openai_quality: "standard" # Quality: standard or hd (dall-e-3 only) + openai_style: "natural" # Style: natural or vivid (dall-e-3 only) + + # Caching + enable_cache: true + cache_dir: "./.image_cache" output: directory: ./anki_cards @@ -125,21 +149,27 @@ totalrecall [word] [flags] - `--skip-audio`: Skip audio generation - `--skip-images`: Skip image download - `--images-per-word int`: Number of images per word (default 1) -- `--image-api string`: Image source - pixabay or unsplash (default "pixabay") +- `--image-api string`: Image source - pixabay, unsplash, or openai (default "openai") #### Audio Provider Options -- `--audio-provider string`: Audio provider - espeak or openai (default "espeak") +- `--audio-provider string`: Audio provider - espeak or openai (default "openai") #### ESpeak Tuning Options - `--pitch int`: Pitch adjustment 0-99 (default 50, lower=deeper, espeak only) - `--amplitude int`: Volume 0-200 (default 100, espeak only) - `--word-gap int`: Gap between words in 10ms units (default 0, espeak only) -#### OpenAI Options +#### OpenAI Audio Options - `--openai-model string`: Model - tts-1 or tts-1-hd (default "tts-1") - `--openai-voice string`: Voice - alloy, echo, fable, onyx, nova, shimmer (default "nova") - `--openai-speed float`: Speech speed 0.25-4.0 (default 1.0) +#### OpenAI Image Options +- `--openai-image-model string`: Model - dall-e-2 or dall-e-3 (default "dall-e-2") +- `--openai-image-size string`: Size - 256x256, 512x512, 1024x1024 (default "512x512") +- `--openai-image-quality string`: Quality - standard or hd (default "standard", dall-e-3 only) +- `--openai-image-style string`: Style - natural or vivid (default "natural", dall-e-3 only) + ## API Keys ### Pixabay @@ -150,15 +180,21 @@ totalrecall [word] [flags] - Required for Unsplash searches - Get your key at: https://unsplash.com/developers +### OpenAI +- Required for both OpenAI TTS audio and DALL-E image generation +- Get your key at: https://platform.openai.com/api-keys +- Set via environment variable: `export OPENAI_API_KEY="sk-..."` +- Or add to config file as `audio.openai_key` + ## Examples ### Basic Usage ```bash -# Single word with espeak-ng +# Single word (uses OpenAI by default) totalrecall котка -# Using OpenAI TTS (requires API key in config) -totalrecall котка --audio-provider openai +# Using espeak-ng (free alternative) +totalrecall котка --audio-provider espeak # High-quality OpenAI with specific voice totalrecall ябълка --audio-provider openai --openai-model tts-1-hd --openai-voice alloy @@ -174,6 +210,15 @@ totalrecall куче --skip-images # Generate Anki import file totalrecall --batch words.txt --anki + +# Generate AI images with OpenAI DALL-E +totalrecall ябълка --image-api openai + +# High-quality DALL-E 3 images +totalrecall котка --image-api openai --openai-image-model dall-e-3 --openai-image-quality hd + +# Combine OpenAI audio and images +totalrecall куче --audio-provider openai --image-api openai ``` ### Batch File Format @@ -213,9 +258,23 @@ Make sure espeak-ng is installed and in your PATH. ### OpenAI API errors - Verify your API key is correct and has credits -- Check the API key has TTS permissions enabled + +## Cost Considerations + +### OpenAI Services +- **TTS Audio**: ~$0.015 per 1K characters (tts-1), ~$0.030 (tts-1-hd) +- **DALL-E 2 Images**: ~$0.02 per image (512x512) +- **DALL-E 3 Images**: ~$0.04 per image (standard), ~$0.08 (HD) +- Both services cache results to avoid regenerating identical content + +### Free Alternatives +- **Audio**: Use espeak-ng (free but robotic quality) +- **Images**: Use Pixabay without API key (limited rate) + +### OpenAI Troubleshooting +- Check the API key has proper permissions enabled - If you get rate limit errors, wait a moment and try again -- The tool will automatically fall back to espeak-ng if OpenAI fails +- The tool will automatically fall back to espeak-ng if OpenAI audio fails ### Audio sounds robotic The Bulgarian voice in espeak-ng can sound robotic. To improve quality: @@ -1,3 +1,18 @@ # TODO's -1. [ ] Ultra think about an Implementation of using OpenAPI key to use an OpenAI LLM to generate an image for the flash card. And add all to-do's into this file. +## Completed +1. [x] Implement OpenAI DALL-E image generation for flashcards + - [x] Create OpenAI image provider implementing ImageSearcher interface + - [x] Add configuration flags for DALL-E model, size, quality, and style + - [x] Implement caching mechanism to avoid regenerating identical images + - [x] Create educational prompt generation for language learning + - [x] Add OpenAI provider to image download workflow + - [x] Update documentation with examples and configuration + +## In Progress / Remaining +1. [ ] Write unit tests for OpenAI image provider +2. [ ] Add cost estimation warnings in output (show estimated API costs) +3. [ ] Test with common Bulgarian words (ябълка, котка, куче, хляб) +4. [ ] Consider adding batch image generation for cost optimization +5. [ ] Add image style presets for different learning contexts (e.g., children, adults) +6. [ ] Implement fallback from OpenAI to other providers on failure diff --git a/cmd/totalrecall/main.go b/cmd/totalrecall/main.go new file mode 100644 index 0000000..f6f666a --- /dev/null +++ b/cmd/totalrecall/main.go @@ -0,0 +1,504 @@ +package main + +import ( + "context" + "fmt" + "os" + "path/filepath" + + "github.com/spf13/cobra" + "github.com/spf13/viper" + + "codeberg.org/snonux/totalrecall/internal" + "codeberg.org/snonux/totalrecall/internal/anki" + "codeberg.org/snonux/totalrecall/internal/audio" + "codeberg.org/snonux/totalrecall/internal/image" +) + +var ( + // Flags + cfgFile string + voice string + outputDir string + audioFormat string + imageAPI string + batchFile string + skipAudio bool + skipImages bool + imagesPerWord int + generateAnki bool + // Audio provider flags + audioProvider string + // Audio tuning flags (espeak) + audioPitch int + audioAmplitude int + audioWordGap int + // OpenAI flags + openAIModel string + openAIVoice string + openAISpeed float64 + // OpenAI Image flags + openAIImageModel string + openAIImageSize string + openAIImageQuality string + openAIImageStyle string +) + +// rootCmd represents the base command when called without any subcommands +var rootCmd = &cobra.Command{ + Use: "totalrecall [word]", + Short: "Bulgarian Anki Flashcard Generator", + Long: `totalrecall generates Anki flashcard materials from Bulgarian words. + +It creates audio pronunciation files using espeak-ng and downloads +representative images from web search APIs. + +Example: + totalrecall ябълка # Generate materials for "apple" + totalrecall --batch words.txt # Process multiple words from file`, + Args: cobra.MaximumNArgs(1), + RunE: runCommand, + Version: internal.Version, +} + +func init() { + cobra.OnInitialize(initConfig) + + // Global flags + rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.totalrecall.yaml)") + + // Local flags + rootCmd.Flags().StringVarP(&voice, "voice", "v", "bg+f1", "Voice variant (bg, bg+m1, bg+f1, etc.)") + rootCmd.Flags().StringVarP(&outputDir, "output", "o", "./anki_cards", "Output directory") + rootCmd.Flags().StringVarP(&audioFormat, "format", "f", "mp3", "Audio format (wav or mp3)") + rootCmd.Flags().StringVar(&imageAPI, "image-api", "openai", "Image source (pixabay, unsplash, or openai)") + rootCmd.Flags().StringVar(&batchFile, "batch", "", "Process words from file (one per line)") + rootCmd.Flags().BoolVar(&skipAudio, "skip-audio", false, "Skip audio generation") + rootCmd.Flags().BoolVar(&skipImages, "skip-images", false, "Skip image download") + rootCmd.Flags().IntVar(&imagesPerWord, "images-per-word", 1, "Number of images to download per word") + rootCmd.Flags().BoolVar(&generateAnki, "anki", false, "Generate Anki import CSV file") + + // Audio provider selection + rootCmd.Flags().StringVar(&audioProvider, "audio-provider", "openai", "Audio provider: espeak or openai") + + // Audio tuning flags (espeak) + rootCmd.Flags().IntVar(&audioPitch, "pitch", 50, "Audio pitch adjustment (0-99, default 50, espeak only)") + rootCmd.Flags().IntVar(&audioAmplitude, "amplitude", 100, "Audio volume (0-200, default 100, espeak only)") + rootCmd.Flags().IntVar(&audioWordGap, "word-gap", 0, "Gap between words in 10ms units (default 0, espeak only)") + + // OpenAI flags + rootCmd.Flags().StringVar(&openAIModel, "openai-model", "tts-1", "OpenAI model: tts-1 or tts-1-hd") + rootCmd.Flags().StringVar(&openAIVoice, "openai-voice", "nova", "OpenAI voice: alloy, echo, fable, onyx, nova, shimmer") + rootCmd.Flags().Float64Var(&openAISpeed, "openai-speed", 1.0, "OpenAI speech speed (0.25 to 4.0)") + + // OpenAI Image Generation flags + rootCmd.Flags().StringVar(&openAIImageModel, "openai-image-model", "dall-e-2", "OpenAI image model: dall-e-2 or dall-e-3") + rootCmd.Flags().StringVar(&openAIImageSize, "openai-image-size", "512x512", "Image size: 256x256, 512x512, 1024x1024 (dall-e-3: also 1024x1792, 1792x1024)") + rootCmd.Flags().StringVar(&openAIImageQuality, "openai-image-quality", "standard", "Image quality: standard or hd (dall-e-3 only)") + rootCmd.Flags().StringVar(&openAIImageStyle, "openai-image-style", "natural", "Image style: natural or vivid (dall-e-3 only)") + + // Bind flags to viper + viper.BindPFlag("audio.provider", rootCmd.Flags().Lookup("audio-provider")) + viper.BindPFlag("audio.voice", rootCmd.Flags().Lookup("voice")) + viper.BindPFlag("audio.format", rootCmd.Flags().Lookup("format")) + viper.BindPFlag("audio.pitch", rootCmd.Flags().Lookup("pitch")) + viper.BindPFlag("audio.amplitude", rootCmd.Flags().Lookup("amplitude")) + viper.BindPFlag("audio.word_gap", rootCmd.Flags().Lookup("word-gap")) + viper.BindPFlag("audio.openai_model", rootCmd.Flags().Lookup("openai-model")) + viper.BindPFlag("audio.openai_voice", rootCmd.Flags().Lookup("openai-voice")) + viper.BindPFlag("audio.openai_speed", rootCmd.Flags().Lookup("openai-speed")) + viper.BindPFlag("output.directory", rootCmd.Flags().Lookup("output")) + viper.BindPFlag("image.provider", rootCmd.Flags().Lookup("image-api")) + // Bind OpenAI image flags + viper.BindPFlag("image.openai_model", rootCmd.Flags().Lookup("openai-image-model")) + viper.BindPFlag("image.openai_size", rootCmd.Flags().Lookup("openai-image-size")) + viper.BindPFlag("image.openai_quality", rootCmd.Flags().Lookup("openai-image-quality")) + viper.BindPFlag("image.openai_style", rootCmd.Flags().Lookup("openai-image-style")) +} + +func initConfig() { + if cfgFile != "" { + // Use config file from the flag + viper.SetConfigFile(cfgFile) + } else { + // Find home directory + home, err := os.UserHomeDir() + cobra.CheckErr(err) + + // Search config in home directory with name ".totalrecall" (without extension) + viper.AddConfigPath(home) + viper.AddConfigPath(".") + viper.SetConfigType("yaml") + viper.SetConfigName(".totalrecall") + } + + // Environment variables + viper.SetEnvPrefix("TOTALRECALL") + viper.AutomaticEnv() + + // Read config file + if err := viper.ReadInConfig(); err == nil { + fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed()) + } +} + +func runCommand(cmd *cobra.Command, args []string) error { + // Determine words to process + var words []string + + if batchFile != "" { + // Read words from file + content, err := os.ReadFile(batchFile) + if err != nil { + return fmt.Errorf("failed to read batch file: %w", err) + } + // Split by newlines and filter empty lines + lines := string(content) + for _, line := range splitLines(lines) { + if line = trimSpace(line); line != "" { + words = append(words, line) + } + } + } else if len(args) > 0 { + // Single word from command line + words = []string{args[0]} + } else { + // No input provided + return fmt.Errorf("please provide a Bulgarian word or use --batch flag") + } + + // Validate words + for _, word := range words { + if err := audio.ValidateBulgarianText(word); err != nil { + return fmt.Errorf("invalid word '%s': %w", word, err) + } + } + + // Create output directory + if err := os.MkdirAll(outputDir, 0755); err != nil { + return fmt.Errorf("failed to create output directory: %w", err) + } + + // Process each word + for i, word := range words { + fmt.Printf("\nProcessing %d/%d: %s\n", i+1, len(words), word) + + if err := processWord(word); err != nil { + fmt.Fprintf(os.Stderr, "Error processing '%s': %v\n", word, err) + // Continue with next word + } + } + + // Generate Anki CSV if requested + if generateAnki { + fmt.Printf("\nGenerating Anki import file...\n") + if err := generateAnkiCSV(); err != nil { + fmt.Fprintf(os.Stderr, "Warning: Failed to generate Anki CSV: %v\n", err) + } else { + fmt.Println("Anki import file created: anki_import.csv") + } + } + + fmt.Println("\nDone! Materials saved to:", outputDir) + return nil +} + +func processWord(word string) error { + // Generate audio + if !skipAudio { + fmt.Printf(" Generating audio...\n") + if err := generateAudio(word); err != nil { + return fmt.Errorf("audio generation failed: %w", err) + } + } + + // Download images + if !skipImages { + fmt.Printf(" Downloading images...\n") + if err := downloadImages(word); err != nil { + return fmt.Errorf("image download failed: %w", err) + } + } + + return nil +} + +func generateAudio(word string) error { + // Create audio provider configuration + providerConfig := &audio.Config{ + Provider: audioProvider, + OutputDir: outputDir, + OutputFormat: audioFormat, + + // ESpeak settings + ESpeakVoice: voice, + ESpeakSpeed: viper.GetInt("audio.speed"), + ESpeakPitch: audioPitch, + ESpeakAmplitude: audioAmplitude, + ESpeakWordGap: audioWordGap, + + // OpenAI settings + OpenAIKey: getOpenAIKey(), + OpenAIModel: openAIModel, + OpenAIVoice: openAIVoice, + OpenAISpeed: openAISpeed, + + // Caching + EnableCache: viper.GetBool("audio.enable_cache"), + CacheDir: viper.GetString("audio.cache_dir"), + } + + // Set defaults + if providerConfig.ESpeakSpeed == 0 { + providerConfig.ESpeakSpeed = 150 + } + if providerConfig.CacheDir == "" { + providerConfig.CacheDir = "./.audio_cache" + } + + // Use config file values if not overridden by flags + if audioProvider == "openai" && viper.IsSet("audio.provider") { + providerConfig.Provider = viper.GetString("audio.provider") + } + if audioPitch == 50 && viper.IsSet("audio.pitch") { + providerConfig.ESpeakPitch = viper.GetInt("audio.pitch") + } + if audioAmplitude == 100 && viper.IsSet("audio.amplitude") { + providerConfig.ESpeakAmplitude = viper.GetInt("audio.amplitude") + } + if audioWordGap == 0 && viper.IsSet("audio.word_gap") { + providerConfig.ESpeakWordGap = viper.GetInt("audio.word_gap") + } + if openAIModel == "tts-1" && viper.IsSet("audio.openai_model") { + providerConfig.OpenAIModel = viper.GetString("audio.openai_model") + } + if openAIVoice == "nova" && viper.IsSet("audio.openai_voice") { + providerConfig.OpenAIVoice = viper.GetString("audio.openai_voice") + } + if openAISpeed == 1.0 && viper.IsSet("audio.openai_speed") { + providerConfig.OpenAISpeed = viper.GetFloat64("audio.openai_speed") + } + + // Create the audio provider + provider, err := audio.NewProvider(providerConfig) + if err != nil { + // If OpenAI fails, try to create a fallback to espeak + if providerConfig.Provider == "openai" { + fmt.Printf("Warning: OpenAI audio provider failed (%v), falling back to espeak-ng\n", err) + providerConfig.Provider = "espeak" + fallbackProvider, fallbackErr := audio.NewProvider(providerConfig) + if fallbackErr != nil { + return fmt.Errorf("both OpenAI and espeak-ng failed: %v", fallbackErr) + } + provider = fallbackProvider + } else { + return err + } + } + + // Generate audio file + filename := sanitizeFilename(word) + outputFile := filepath.Join(outputDir, fmt.Sprintf("%s.%s", filename, audioFormat)) + + ctx := context.Background() + return provider.GenerateAudio(ctx, word, outputFile) +} + +func downloadImages(word string) error { + // Create image searcher based on provider + var searcher image.ImageSearcher + var err error + + switch imageAPI { + case "pixabay": + apiKey := viper.GetString("image.pixabay_key") + searcher = image.NewPixabayClient(apiKey) + + case "unsplash": + apiKey := viper.GetString("image.unsplash_key") + if apiKey == "" { + return fmt.Errorf("Unsplash API key is required in config") + } + searcher, err = image.NewUnsplashClient(apiKey) + if err != nil { + return err + } + + case "openai": + // Create OpenAI image configuration + openaiConfig := &image.OpenAIConfig{ + APIKey: getOpenAIKey(), + Model: openAIImageModel, + Size: openAIImageSize, + Quality: openAIImageQuality, + Style: openAIImageStyle, + CacheDir: viper.GetString("image.cache_dir"), + EnableCache: viper.GetBool("image.enable_cache"), + } + + // Use config file values if not overridden by flags + if openAIImageModel == "dall-e-2" && viper.IsSet("image.openai_model") { + openaiConfig.Model = viper.GetString("image.openai_model") + } + if openAIImageSize == "512x512" && viper.IsSet("image.openai_size") { + openaiConfig.Size = viper.GetString("image.openai_size") + } + if openAIImageQuality == "standard" && viper.IsSet("image.openai_quality") { + openaiConfig.Quality = viper.GetString("image.openai_quality") + } + if openAIImageStyle == "natural" && viper.IsSet("image.openai_style") { + openaiConfig.Style = viper.GetString("image.openai_style") + } + + // Set defaults + if openaiConfig.CacheDir == "" { + openaiConfig.CacheDir = "./.image_cache" + } + if !viper.IsSet("image.enable_cache") { + openaiConfig.EnableCache = true + } + + searcher = image.NewOpenAIClient(openaiConfig) + if openaiConfig.APIKey == "" { + fmt.Printf("Warning: OpenAI API key not found, falling back to Pixabay for images\n") + imageAPI = "pixabay" + searcher = image.NewPixabayClient("") + } + + default: + return fmt.Errorf("unknown image provider: %s", imageAPI) + } + + // Create downloader + downloadOpts := &image.DownloadOptions{ + OutputDir: outputDir, + OverwriteExisting: false, + CreateDir: true, + FileNamePattern: "{word}_{index}", + MaxSizeBytes: 5 * 1024 * 1024, // 5MB + } + + downloader := image.NewDownloader(searcher, downloadOpts) + + // Download images + ctx := context.Background() + if imagesPerWord == 1 { + _, path, err := downloader.DownloadBestMatch(ctx, word) + if err != nil { + return err + } + fmt.Printf(" Downloaded: %s\n", path) + } else { + paths, err := downloader.DownloadMultiple(ctx, word, imagesPerWord) + if err != nil { + return err + } + for _, path := range paths { + fmt.Printf(" Downloaded: %s\n", path) + } + } + + return nil +} + +func sanitizeFilename(s string) string { + // Simple filename sanitization + result := "" + for _, r := range s { + if isAlphaNumeric(r) || r == '-' || r == '_' { + result += string(r) + } else { + result += "_" + } + } + return result +} + +func isAlphaNumeric(r rune) bool { + return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || + (r >= '0' && r <= '9') || (r >= 'а' && r <= 'я') || + (r >= 'А' && r <= 'Я') +} + +func splitLines(s string) []string { + // Simple line splitter + var lines []string + current := "" + for _, r := range s { + if r == '\n' { + lines = append(lines, current) + current = "" + } else if r != '\r' { + current += string(r) + } + } + if current != "" { + lines = append(lines, current) + } + return lines +} + +func trimSpace(s string) string { + // Simple trim implementation + start := 0 + end := len(s) + + // Trim from start + for start < end && isSpace(rune(s[start])) { + start++ + } + + // Trim from end + for end > start && isSpace(rune(s[end-1])) { + end-- + } + + return s[start:end] +} + +func isSpace(r rune) bool { + return r == ' ' || r == '\t' || r == '\n' || r == '\r' +} + +func generateAnkiCSV() error { + // Create Anki generator + gen := anki.NewGenerator(&anki.GeneratorOptions{ + OutputPath: filepath.Join(outputDir, "anki_import.csv"), + MediaFolder: outputDir, + IncludeHeaders: true, + AudioFormat: audioFormat, + }) + + // Generate cards from output directory + if err := gen.GenerateFromDirectory(outputDir); err != nil { + return fmt.Errorf("failed to generate cards: %w", err) + } + + // Generate CSV + if err := gen.GenerateCSV(); err != nil { + return fmt.Errorf("failed to generate CSV: %w", err) + } + + // Print stats + total, withAudio, withImages := gen.Stats() + fmt.Printf(" Generated %d cards (%d with audio, %d with images)\n", + total, withAudio, withImages) + + return nil +} + +func getOpenAIKey() string { + // First check environment variable + if key := os.Getenv("OPENAI_API_KEY"); key != "" { + return key + } + + // Then check config file + return viper.GetString("audio.openai_key") +} + +func main() { + if err := rootCmd.Execute(); err != nil { + os.Exit(1) + } +}
\ No newline at end of file diff --git a/internal/audio/provider.go b/internal/audio/provider.go index 5b8c336..c803b61 100644 --- a/internal/audio/provider.go +++ b/internal/audio/provider.go @@ -44,7 +44,7 @@ type Config struct { // DefaultConfig returns default configuration func DefaultProviderConfig() *Config { return &Config{ - Provider: "espeak", + Provider: "openai", OutputDir: "./", OutputFormat: "mp3", ESpeakVoice: "bg", diff --git a/internal/image/openai.go b/internal/image/openai.go new file mode 100644 index 0000000..a5a3e31 --- /dev/null +++ b/internal/image/openai.go @@ -0,0 +1,329 @@ +package image + +import ( + "context" + "crypto/md5" + "encoding/hex" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "strings" + + "github.com/sashabaranov/go-openai" +) + +// OpenAIClient implements ImageSearcher for OpenAI DALL-E image generation +type OpenAIClient struct { + client *openai.Client + apiKey string + model string // dall-e-2 or dall-e-3 + size string // 256x256, 512x512, 1024x1024 + quality string // standard or hd (dall-e-3 only) + style string // natural or vivid (dall-e-3 only) + cacheDir string + enableCache bool +} + +// OpenAIConfig holds configuration for the OpenAI image provider +type OpenAIConfig struct { + APIKey string + Model string + Size string + Quality string + Style string + CacheDir string + EnableCache bool +} + +// NewOpenAIClient creates a new OpenAI DALL-E client +func NewOpenAIClient(config *OpenAIConfig) *OpenAIClient { + if config.APIKey == "" { + // Return nil client that will fail on operations + return &OpenAIClient{} + } + + client := openai.NewClient(config.APIKey) + + // Set defaults + if config.Model == "" { + config.Model = "dall-e-2" + } + if config.Size == "" { + config.Size = "512x512" + } + if config.Quality == "" { + config.Quality = "standard" + } + if config.Style == "" { + config.Style = "natural" + } + if config.CacheDir == "" { + config.CacheDir = "./.image_cache" + } + + oc := &OpenAIClient{ + client: client, + apiKey: config.APIKey, + model: config.Model, + size: config.Size, + quality: config.Quality, + style: config.Style, + cacheDir: config.CacheDir, + enableCache: config.EnableCache, + } + + // Create cache directory if caching is enabled + if oc.enableCache && oc.cacheDir != "" { + os.MkdirAll(oc.cacheDir, 0755) + } + + return oc +} + +// Search generates an image for the Bulgarian word using DALL-E +func (c *OpenAIClient) Search(ctx context.Context, opts *SearchOptions) ([]SearchResult, error) { + if c.client == nil { + return nil, &SearchError{ + Provider: "openai", + Code: "NO_API_KEY", + Message: "OpenAI API key not configured", + } + } + + // Check cache first + if c.enableCache { + cacheFile := c.getCacheFilePath(opts.Query) + if info, err := os.Stat(cacheFile); err == nil && info.Size() > 0 { + // Return cached result + result := SearchResult{ + ID: c.generateImageID(opts.Query), + URL: cacheFile, + ThumbnailURL: cacheFile, + Width: c.getSizeWidth(), + Height: c.getSizeHeight(), + Description: fmt.Sprintf("Generated image for %s", opts.Query), + Attribution: "Generated by OpenAI DALL-E", + Source: "openai", + } + return []SearchResult{result}, nil + } + } + + // Translate Bulgarian word to English for better results + translatedWord := translateBulgarianToEnglish(opts.Query) + + // Create educational prompt + prompt := c.createEducationalPrompt(opts.Query, translatedWord) + + // Create the image generation request + req := openai.ImageRequest{ + Prompt: prompt, + Model: c.model, + Size: c.size, + ResponseFormat: openai.CreateImageResponseFormatURL, + N: 1, + } + + // Add model-specific parameters + if c.model == "dall-e-3" { + req.Quality = c.quality + req.Style = c.style + } + + // Generate the image + resp, err := c.client.CreateImage(ctx, req) + if err != nil { + return nil, &SearchError{ + Provider: "openai", + Code: "API_ERROR", + Message: fmt.Sprintf("Failed to generate image: %v", err), + } + } + + if len(resp.Data) == 0 { + return nil, &SearchError{ + Provider: "openai", + Code: "NO_RESULTS", + Message: "No image generated", + } + } + + // Get the generated image URL + imageURL := resp.Data[0].URL + + // Download and cache the image if caching is enabled + if c.enableCache { + cacheFile := c.getCacheFilePath(opts.Query) + if err := c.downloadAndCache(ctx, imageURL, cacheFile); err == nil { + // Update URL to point to cached file + imageURL = cacheFile + } + // Continue even if caching fails + } + + // Create result + result := SearchResult{ + ID: c.generateImageID(opts.Query), + URL: imageURL, + ThumbnailURL: imageURL, + Width: c.getSizeWidth(), + Height: c.getSizeHeight(), + Description: fmt.Sprintf("Generated educational image for %s (%s)", opts.Query, translatedWord), + Attribution: "Generated by OpenAI DALL-E", + Source: "openai", + } + + return []SearchResult{result}, nil +} + +// Download downloads an image from the given URL +func (c *OpenAIClient) Download(ctx context.Context, url string) (io.ReadCloser, error) { + // If it's a local cached file (not an HTTP/HTTPS URL), open it directly + if !strings.HasPrefix(url, "http://") && !strings.HasPrefix(url, "https://") { + file, err := os.Open(url) + if err != nil { + return nil, fmt.Errorf("failed to open cached file: %w", err) + } + return file, nil + } + + // Otherwise download from URL + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + + if resp.StatusCode != http.StatusOK { + resp.Body.Close() + return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) + } + + return resp.Body, nil +} + +// GetAttribution returns the required attribution text +func (c *OpenAIClient) GetAttribution(result *SearchResult) string { + return "Image generated by OpenAI DALL-E" +} + +// Name returns the name of the provider +func (c *OpenAIClient) Name() string { + return "openai" +} + +// createEducationalPrompt generates a prompt optimized for language learning +func (c *OpenAIClient) createEducationalPrompt(bulgarianWord, englishTranslation string) string { + // Create a prompt that generates clear, educational images + // suitable for language learning flashcards + return fmt.Sprintf( + "A simple, clear, photorealistic educational image showing %s, "+ + "suitable for language learning flashcards. "+ + "The image should be easily recognizable, with good lighting, "+ + "plain background, and focused on a single clear subject. "+ + "No text, labels, or writing in the image.", + englishTranslation, + ) +} + +// translateBulgarianToEnglish translates a Bulgarian word to English +func translateBulgarianToEnglish(word string) string { + // Use the existing translation function from translate.go + return translateBulgarianQuery(word) +} + +// getCacheFilePath generates a cache file path for the given word +func (c *OpenAIClient) getCacheFilePath(word string) string { + // Create a hash of the word and settings + h := md5.New() + h.Write([]byte(word)) + h.Write([]byte(c.model)) + h.Write([]byte(c.size)) + h.Write([]byte(c.quality)) + h.Write([]byte(c.style)) + hash := hex.EncodeToString(h.Sum(nil)) + + // Use first 2 chars as subdirectory for better file system performance + subdir := hash[:2] + filename := hash[2:] + ".png" + + return filepath.Join(c.cacheDir, subdir, filename) +} + +// downloadAndCache downloads an image and saves it to the cache +func (c *OpenAIClient) downloadAndCache(ctx context.Context, url, cacheFile string) error { + // Ensure directory exists + dir := filepath.Dir(cacheFile) + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + + // Download the image + resp, err := c.Download(ctx, url) + if err != nil { + return err + } + defer resp.Close() + + // Create the cache file + out, err := os.Create(cacheFile) + if err != nil { + return err + } + defer out.Close() + + // Copy the data + _, err = io.Copy(out, resp) + return err +} + +// generateImageID creates a unique ID for the image +func (c *OpenAIClient) generateImageID(word string) string { + h := md5.New() + h.Write([]byte(word)) + h.Write([]byte(c.model)) + return "openai_" + hex.EncodeToString(h.Sum(nil))[:8] +} + +// getSizeWidth returns the width based on the size setting +func (c *OpenAIClient) getSizeWidth() int { + switch c.size { + case "256x256": + return 256 + case "512x512": + return 512 + case "1024x1024": + return 1024 + case "1024x1792", "1792x1024": // DALL-E 3 sizes + if strings.HasPrefix(c.size, "1024") { + return 1024 + } + return 1792 + default: + return 512 + } +} + +// getSizeHeight returns the height based on the size setting +func (c *OpenAIClient) getSizeHeight() int { + switch c.size { + case "256x256": + return 256 + case "512x512": + return 512 + case "1024x1024": + return 1024 + case "1024x1792": + return 1792 + case "1792x1024": + return 1024 + default: + return 512 + } +}
\ No newline at end of file diff --git a/internal/image/openai_test.go b/internal/image/openai_test.go new file mode 100644 index 0000000..8f42aeb --- /dev/null +++ b/internal/image/openai_test.go @@ -0,0 +1,280 @@ +package image + +import ( + "context" + "os" + "testing" +) + +func TestOpenAIClient_NewClient(t *testing.T) { + tests := []struct { + name string + config *OpenAIConfig + wantNil bool + }{ + { + name: "with API key", + config: &OpenAIConfig{ + APIKey: "test-key", + Model: "dall-e-2", + Size: "512x512", + }, + wantNil: false, + }, + { + name: "without API key", + config: &OpenAIConfig{ + APIKey: "", + }, + wantNil: false, // Client is created but will fail on operations + }, + { + name: "with defaults", + config: &OpenAIConfig{ + APIKey: "test-key", + }, + wantNil: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client := NewOpenAIClient(tt.config) + if (client == nil) != tt.wantNil { + t.Errorf("NewOpenAIClient() returned nil = %v, want %v", client == nil, tt.wantNil) + } + + if client != nil && tt.config.APIKey != "" { + // Check defaults were set + if tt.config.Model == "" && client.model != "dall-e-2" { + t.Errorf("Expected default model dall-e-2, got %s", client.model) + } + if tt.config.Size == "" && client.size != "512x512" { + t.Errorf("Expected default size 512x512, got %s", client.size) + } + } + }) + } +} + +func TestOpenAIClient_createEducationalPrompt(t *testing.T) { + client := &OpenAIClient{} + + tests := []struct { + bulgarian string + english string + wantContains []string + }{ + { + bulgarian: "ябълка", + english: "apple", + wantContains: []string{"apple", "educational", "flashcard"}, + }, + { + bulgarian: "котка", + english: "cat", + wantContains: []string{"cat", "simple", "clear"}, + }, + } + + for _, tt := range tests { + t.Run(tt.bulgarian, func(t *testing.T) { + prompt := client.createEducationalPrompt(tt.bulgarian, tt.english) + + for _, want := range tt.wantContains { + if !contains(prompt, want) { + t.Errorf("Prompt missing expected word '%s': %s", want, prompt) + } + } + }) + } +} + +func TestOpenAIClient_getCacheFilePath(t *testing.T) { + client := &OpenAIClient{ + model: "dall-e-2", + size: "512x512", + quality: "standard", + style: "natural", + cacheDir: "./.test_cache", + } + + // Test that same input produces same cache path + path1 := client.getCacheFilePath("ябълка") + path2 := client.getCacheFilePath("ябълка") + + if path1 != path2 { + t.Errorf("Cache paths differ for same input: %s vs %s", path1, path2) + } + + // Test that different inputs produce different paths + path3 := client.getCacheFilePath("котка") + if path1 == path3 { + t.Errorf("Cache paths same for different inputs") + } + + // Test path structure + if !contains(path1, ".test_cache") { + t.Errorf("Cache path doesn't contain cache dir: %s", path1) + } + + if !contains(path1, ".png") { + t.Errorf("Cache path doesn't have .png extension: %s", path1) + } +} + +func TestOpenAIClient_translateBulgarianToEnglish(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"ябълка", "apple"}, + {"котка", "cat"}, + {"куче", "dog"}, + {"хляб", "bread"}, + {"unknown", "unknown"}, // Should return original if not in dictionary + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + result := translateBulgarianToEnglish(tt.input) + if result != tt.expected { + t.Errorf("translateBulgarianToEnglish(%s) = %s, want %s", + tt.input, result, tt.expected) + } + }) + } +} + +func TestOpenAIClient_getSizeWidthHeight(t *testing.T) { + tests := []struct { + size string + width int + height int + }{ + {"256x256", 256, 256}, + {"512x512", 512, 512}, + {"1024x1024", 1024, 1024}, + {"1024x1792", 1024, 1792}, + {"1792x1024", 1792, 1024}, + {"unknown", 512, 512}, // Default + } + + for _, tt := range tests { + t.Run(tt.size, func(t *testing.T) { + client := &OpenAIClient{size: tt.size} + + if w := client.getSizeWidth(); w != tt.width { + t.Errorf("getSizeWidth() = %d, want %d", w, tt.width) + } + + if h := client.getSizeHeight(); h != tt.height { + t.Errorf("getSizeHeight() = %d, want %d", h, tt.height) + } + }) + } +} + +func TestOpenAIClient_Search_NoAPIKey(t *testing.T) { + client := NewOpenAIClient(&OpenAIConfig{}) + + opts := DefaultSearchOptions("ябълка") + _, err := client.Search(context.Background(), opts) + + if err == nil { + t.Error("Expected error for missing API key") + } + + if searchErr, ok := err.(*SearchError); ok { + if searchErr.Code != "NO_API_KEY" { + t.Errorf("Expected NO_API_KEY error, got %s", searchErr.Code) + } + } else { + t.Error("Expected SearchError type") + } +} + +func TestOpenAIClient_Name(t *testing.T) { + client := &OpenAIClient{} + if name := client.Name(); name != "openai" { + t.Errorf("Name() = %s, want 'openai'", name) + } +} + +func TestOpenAIClient_GetAttribution(t *testing.T) { + client := &OpenAIClient{} + result := &SearchResult{} + + attr := client.GetAttribution(result) + if !contains(attr, "OpenAI DALL-E") { + t.Errorf("Attribution doesn't mention OpenAI DALL-E: %s", attr) + } +} + +// Helper function +func contains(s, substr string) bool { + return len(s) >= len(substr) && + (s == substr || len(s) > 0 && containsHelper(s, substr)) +} + +func containsHelper(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +} + +// Integration test (skipped by default) +func TestOpenAIClient_Search_Integration(t *testing.T) { + apiKey := os.Getenv("OPENAI_API_KEY") + if apiKey == "" { + t.Skip("OPENAI_API_KEY not set, skipping integration test") + } + + client := NewOpenAIClient(&OpenAIConfig{ + APIKey: apiKey, + Model: "dall-e-2", + Size: "256x256", // Smallest size to minimize cost + EnableCache: true, + CacheDir: t.TempDir(), + }) + + opts := DefaultSearchOptions("ябълка") + results, err := client.Search(context.Background(), opts) + + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + if len(results) != 1 { + t.Fatalf("Expected 1 result, got %d", len(results)) + } + + result := results[0] + + // Check result fields + if result.ID == "" { + t.Error("Result ID is empty") + } + if result.URL == "" { + t.Error("Result URL is empty") + } + if result.Width != 256 || result.Height != 256 { + t.Errorf("Expected 256x256, got %dx%d", result.Width, result.Height) + } + if result.Source != "openai" { + t.Errorf("Expected source 'openai', got '%s'", result.Source) + } + + // Test caching - second request should use cache + results2, err := client.Search(context.Background(), opts) + if err != nil { + t.Fatalf("Second search failed: %v", err) + } + + if results2[0].URL != results[0].URL { + t.Log("Note: URLs differ, cache might not be working as expected") + } +}
\ No newline at end of file diff --git a/internal/version.go b/internal/version.go index 93a42a8..0894830 100644 --- a/internal/version.go +++ b/internal/version.go @@ -1,3 +1,3 @@ package internal -const Version = "0.0.0" +const Version = "0.1.0" |
