summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--README.md91
-rw-r--r--TODO.md17
-rw-r--r--cmd/totalrecall/main.go504
-rw-r--r--internal/audio/provider.go2
-rw-r--r--internal/image/openai.go329
-rw-r--r--internal/image/openai_test.go280
-rw-r--r--internal/version.go2
8 files changed, 1207 insertions, 19 deletions
diff --git a/.gitignore b/.gitignore
index 3f0af1f..20e144e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,4 @@ anki_cards
# Configuration with API keys
.totalrecall.yaml
+.image_cache/
diff --git a/README.md b/README.md
index f4d6b96..d27b734 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,18 @@
# totalrecall - Bulgarian Anki Flashcard Generator
-`totalrecall` is a command-line tool that generates Anki flashcard materials from Bulgarian words. It creates audio pronunciation files using espeak-ng or OpenAI TTS and downloads representative images from web search APIs.
+`totalrecall` is a command-line tool that generates Anki flashcard materials from Bulgarian words. It creates audio pronunciation files and generates images using AI.
+
+⚠️ **Important:** This tool uses OpenAI services by default, which requires an API key. See [Quick Start](#quick-start) for setup instructions or use the free alternatives with `--audio-provider espeak --image-api pixabay`.
## Features
- Audio generation with multiple providers:
- **espeak-ng**: Free, offline Bulgarian voices (robotic quality)
- **OpenAI TTS**: High-quality, natural-sounding voices (requires API key)
-- Image search via Pixabay and Unsplash APIs
+- Image search and generation:
+ - **Pixabay**: Free stock photo search (optional API key)
+ - **Unsplash**: High-quality photo search (requires API key)
+ - **OpenAI DALL-E**: AI-generated educational images (requires API key)
- Batch processing of multiple words
- Anki-compatible CSV export
- Configurable voice variants and speech speed
@@ -59,17 +64,27 @@ go install codeberg.org/snonux/totalrecall/cmd/totalrecall@latest
## Quick Start
-1. Generate materials for a single word:
+**Note:** By default, totalrecall uses OpenAI for both audio and images. Make sure to set your OpenAI API key:
+```bash
+export OPENAI_API_KEY="sk-..."
+```
+
+1. Generate materials for a single word (uses OpenAI by default):
```bash
totalrecall ябълка
```
-2. Process multiple words from a file:
+2. Use free alternatives (espeak + pixabay):
+ ```bash
+ totalrecall ябълка --audio-provider espeak --image-api pixabay
+ ```
+
+3. Process multiple words from a file:
```bash
totalrecall --batch words.txt
```
-3. Generate with Anki CSV:
+4. Generate with Anki CSV:
```bash
totalrecall ябълка --anki
```
@@ -80,7 +95,7 @@ Create a `.totalrecall.yaml` file in your home directory or project folder:
```yaml
audio:
- provider: openai # Audio provider (espeak or openai)
+ provider: openai # Audio provider (espeak or openai) - default: openai
format: mp3 # Audio format (wav or mp3)
# ESpeak settings
@@ -99,10 +114,19 @@ audio:
cache_dir: "./.audio_cache"
image:
- provider: pixabay # Image provider (pixabay or unsplash)
+ provider: openai # Image provider (pixabay, unsplash, or openai) - default: openai
pixabay_key: "" # Optional API key for higher limits
unsplash_key: "" # Required for Unsplash
- size: medium # Image size preference
+
+ # OpenAI DALL-E settings
+ openai_model: "dall-e-2" # Model: dall-e-2 or dall-e-3
+ openai_size: "512x512" # Size: 256x256, 512x512, 1024x1024
+ openai_quality: "standard" # Quality: standard or hd (dall-e-3 only)
+ openai_style: "natural" # Style: natural or vivid (dall-e-3 only)
+
+ # Caching
+ enable_cache: true
+ cache_dir: "./.image_cache"
output:
directory: ./anki_cards
@@ -125,21 +149,27 @@ totalrecall [word] [flags]
- `--skip-audio`: Skip audio generation
- `--skip-images`: Skip image download
- `--images-per-word int`: Number of images per word (default 1)
-- `--image-api string`: Image source - pixabay or unsplash (default "pixabay")
+- `--image-api string`: Image source - pixabay, unsplash, or openai (default "openai")
#### Audio Provider Options
-- `--audio-provider string`: Audio provider - espeak or openai (default "espeak")
+- `--audio-provider string`: Audio provider - espeak or openai (default "openai")
#### ESpeak Tuning Options
- `--pitch int`: Pitch adjustment 0-99 (default 50, lower=deeper, espeak only)
- `--amplitude int`: Volume 0-200 (default 100, espeak only)
- `--word-gap int`: Gap between words in 10ms units (default 0, espeak only)
-#### OpenAI Options
+#### OpenAI Audio Options
- `--openai-model string`: Model - tts-1 or tts-1-hd (default "tts-1")
- `--openai-voice string`: Voice - alloy, echo, fable, onyx, nova, shimmer (default "nova")
- `--openai-speed float`: Speech speed 0.25-4.0 (default 1.0)
+#### OpenAI Image Options
+- `--openai-image-model string`: Model - dall-e-2 or dall-e-3 (default "dall-e-2")
+- `--openai-image-size string`: Size - 256x256, 512x512, 1024x1024 (default "512x512")
+- `--openai-image-quality string`: Quality - standard or hd (default "standard", dall-e-3 only)
+- `--openai-image-style string`: Style - natural or vivid (default "natural", dall-e-3 only)
+
## API Keys
### Pixabay
@@ -150,15 +180,21 @@ totalrecall [word] [flags]
- Required for Unsplash searches
- Get your key at: https://unsplash.com/developers
+### OpenAI
+- Required for both OpenAI TTS audio and DALL-E image generation
+- Get your key at: https://platform.openai.com/api-keys
+- Set via environment variable: `export OPENAI_API_KEY="sk-..."`
+- Or add to config file as `audio.openai_key`
+
## Examples
### Basic Usage
```bash
-# Single word with espeak-ng
+# Single word (uses OpenAI by default)
totalrecall котка
-# Using OpenAI TTS (requires API key in config)
-totalrecall котка --audio-provider openai
+# Using espeak-ng (free alternative)
+totalrecall котка --audio-provider espeak
# High-quality OpenAI with specific voice
totalrecall ябълка --audio-provider openai --openai-model tts-1-hd --openai-voice alloy
@@ -174,6 +210,15 @@ totalrecall куче --skip-images
# Generate Anki import file
totalrecall --batch words.txt --anki
+
+# Generate AI images with OpenAI DALL-E
+totalrecall ябълка --image-api openai
+
+# High-quality DALL-E 3 images
+totalrecall котка --image-api openai --openai-image-model dall-e-3 --openai-image-quality hd
+
+# Combine OpenAI audio and images
+totalrecall куче --audio-provider openai --image-api openai
```
### Batch File Format
@@ -213,9 +258,23 @@ Make sure espeak-ng is installed and in your PATH.
### OpenAI API errors
- Verify your API key is correct and has credits
-- Check the API key has TTS permissions enabled
+
+## Cost Considerations
+
+### OpenAI Services
+- **TTS Audio**: ~$0.015 per 1K characters (tts-1), ~$0.030 (tts-1-hd)
+- **DALL-E 2 Images**: ~$0.02 per image (512x512)
+- **DALL-E 3 Images**: ~$0.04 per image (standard), ~$0.08 (HD)
+- Both services cache results to avoid regenerating identical content
+
+### Free Alternatives
+- **Audio**: Use espeak-ng (free but robotic quality)
+- **Images**: Use Pixabay without API key (limited rate)
+
+### OpenAI Troubleshooting
+- Check the API key has proper permissions enabled
- If you get rate limit errors, wait a moment and try again
-- The tool will automatically fall back to espeak-ng if OpenAI fails
+- The tool will automatically fall back to espeak-ng if OpenAI audio fails
### Audio sounds robotic
The Bulgarian voice in espeak-ng can sound robotic. To improve quality:
diff --git a/TODO.md b/TODO.md
index 08ac641..4ceec03 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,3 +1,18 @@
# TODO's
-1. [ ] Ultra think about an Implementation of using OpenAPI key to use an OpenAI LLM to generate an image for the flash card. And add all to-do's into this file.
+## Completed
+1. [x] Implement OpenAI DALL-E image generation for flashcards
+ - [x] Create OpenAI image provider implementing ImageSearcher interface
+ - [x] Add configuration flags for DALL-E model, size, quality, and style
+ - [x] Implement caching mechanism to avoid regenerating identical images
+ - [x] Create educational prompt generation for language learning
+ - [x] Add OpenAI provider to image download workflow
+ - [x] Update documentation with examples and configuration
+
+## In Progress / Remaining
+1. [ ] Write unit tests for OpenAI image provider
+2. [ ] Add cost estimation warnings in output (show estimated API costs)
+3. [ ] Test with common Bulgarian words (ябълка, котка, куче, хляб)
+4. [ ] Consider adding batch image generation for cost optimization
+5. [ ] Add image style presets for different learning contexts (e.g., children, adults)
+6. [ ] Implement fallback from OpenAI to other providers on failure
diff --git a/cmd/totalrecall/main.go b/cmd/totalrecall/main.go
new file mode 100644
index 0000000..f6f666a
--- /dev/null
+++ b/cmd/totalrecall/main.go
@@ -0,0 +1,504 @@
+package main
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "path/filepath"
+
+ "github.com/spf13/cobra"
+ "github.com/spf13/viper"
+
+ "codeberg.org/snonux/totalrecall/internal"
+ "codeberg.org/snonux/totalrecall/internal/anki"
+ "codeberg.org/snonux/totalrecall/internal/audio"
+ "codeberg.org/snonux/totalrecall/internal/image"
+)
+
+// Package-level variables that receive the parsed command-line flag values.
+// They are registered with cobra in init() and read by the functions below.
+var (
+ // Flags
+ cfgFile string
+ voice string
+ outputDir string
+ audioFormat string
+ imageAPI string
+ batchFile string
+ skipAudio bool
+ skipImages bool
+ imagesPerWord int
+ generateAnki bool
+ // Audio provider flags
+ audioProvider string
+ // Audio tuning flags (espeak)
+ audioPitch int
+ audioAmplitude int
+ audioWordGap int
+ // OpenAI flags
+ // (text-to-speech model, voice and speed)
+ openAIModel string
+ openAIVoice string
+ openAISpeed float64
+ // OpenAI Image flags
+ // (image model, size, quality and style)
+ openAIImageModel string
+ openAIImageSize string
+ openAIImageQuality string
+ openAIImageStyle string
+)
+
+// rootCmd is the single top-level command. It accepts at most one positional
+// argument (the word to process) and delegates all work to runCommand.
+// The long help text names the providers that are actually selectable via
+// --audio-provider and --image-api, so it stays in step with the flag defaults.
+var rootCmd = &cobra.Command{
+	Use:   "totalrecall [word]",
+	Short: "Bulgarian Anki Flashcard Generator",
+	Long: `totalrecall generates Anki flashcard materials from Bulgarian words.
+
+It creates audio pronunciation files using OpenAI TTS or espeak-ng and
+obtains images via OpenAI DALL-E generation or Pixabay/Unsplash search.
+
+Example:
+ totalrecall ябълка # Generate materials for "apple"
+ totalrecall --batch words.txt # Process multiple words from file`,
+	Args:    cobra.MaximumNArgs(1),
+	RunE:    runCommand,
+	Version: internal.Version,
+}
+
+// init registers every command-line flag on rootCmd and binds the flags that
+// have a configuration-file counterpart to their viper keys.
+//
+// A failed binding (for example a mistyped flag name, for which Lookup
+// returns nil) is reported through cobra.CheckErr instead of being silently
+// discarded, so a broken binding cannot go unnoticed.
+func init() {
+	cobra.OnInitialize(initConfig)
+
+	// Global flags
+	rootCmd.PersistentFlags().StringVar(&cfgFile, "config", "", "config file (default is $HOME/.totalrecall.yaml)")
+
+	// Local flags
+	flags := rootCmd.Flags()
+	flags.StringVarP(&voice, "voice", "v", "bg+f1", "Voice variant (bg, bg+m1, bg+f1, etc.)")
+	flags.StringVarP(&outputDir, "output", "o", "./anki_cards", "Output directory")
+	flags.StringVarP(&audioFormat, "format", "f", "mp3", "Audio format (wav or mp3)")
+	flags.StringVar(&imageAPI, "image-api", "openai", "Image source (pixabay, unsplash, or openai)")
+	flags.StringVar(&batchFile, "batch", "", "Process words from file (one per line)")
+	flags.BoolVar(&skipAudio, "skip-audio", false, "Skip audio generation")
+	flags.BoolVar(&skipImages, "skip-images", false, "Skip image download")
+	flags.IntVar(&imagesPerWord, "images-per-word", 1, "Number of images to download per word")
+	flags.BoolVar(&generateAnki, "anki", false, "Generate Anki import CSV file")
+
+	// Audio provider selection
+	flags.StringVar(&audioProvider, "audio-provider", "openai", "Audio provider: espeak or openai")
+
+	// Audio tuning flags (espeak)
+	flags.IntVar(&audioPitch, "pitch", 50, "Audio pitch adjustment (0-99, default 50, espeak only)")
+	flags.IntVar(&audioAmplitude, "amplitude", 100, "Audio volume (0-200, default 100, espeak only)")
+	flags.IntVar(&audioWordGap, "word-gap", 0, "Gap between words in 10ms units (default 0, espeak only)")
+
+	// OpenAI flags
+	flags.StringVar(&openAIModel, "openai-model", "tts-1", "OpenAI model: tts-1 or tts-1-hd")
+	flags.StringVar(&openAIVoice, "openai-voice", "nova", "OpenAI voice: alloy, echo, fable, onyx, nova, shimmer")
+	flags.Float64Var(&openAISpeed, "openai-speed", 1.0, "OpenAI speech speed (0.25 to 4.0)")
+
+	// OpenAI Image Generation flags
+	flags.StringVar(&openAIImageModel, "openai-image-model", "dall-e-2", "OpenAI image model: dall-e-2 or dall-e-3")
+	flags.StringVar(&openAIImageSize, "openai-image-size", "512x512", "Image size: 256x256, 512x512, 1024x1024 (dall-e-3: also 1024x1792, 1792x1024)")
+	flags.StringVar(&openAIImageQuality, "openai-image-quality", "standard", "Image quality: standard or hd (dall-e-3 only)")
+	flags.StringVar(&openAIImageStyle, "openai-image-style", "natural", "Image style: natural or vivid (dall-e-3 only)")
+
+	// Bind flags to viper (config key -> flag name), in a fixed order.
+	bindings := []struct{ key, flag string }{
+		{"audio.provider", "audio-provider"},
+		{"audio.voice", "voice"},
+		{"audio.format", "format"},
+		{"audio.pitch", "pitch"},
+		{"audio.amplitude", "amplitude"},
+		{"audio.word_gap", "word-gap"},
+		{"audio.openai_model", "openai-model"},
+		{"audio.openai_voice", "openai-voice"},
+		{"audio.openai_speed", "openai-speed"},
+		{"output.directory", "output"},
+		{"image.provider", "image-api"},
+		{"image.openai_model", "openai-image-model"},
+		{"image.openai_size", "openai-image-size"},
+		{"image.openai_quality", "openai-image-quality"},
+		{"image.openai_style", "openai-image-style"},
+	}
+	for _, b := range bindings {
+		cobra.CheckErr(viper.BindPFlag(b.key, flags.Lookup(b.flag)))
+	}
+}
+
+// initConfig points viper at the configuration file and loads it.
+//
+// With --config the given file is used. Otherwise a file named
+// ".totalrecall" (YAML) is searched for in the home directory and then in
+// the current directory. Environment variables with the TOTALRECALL prefix
+// are enabled as well.
+//
+// A missing config file is not an error. Any other read failure (for
+// example malformed YAML or an unreadable file) is reported as a warning on
+// stderr instead of being silently ignored; the program then continues with
+// flag values and defaults.
+func initConfig() {
+	if cfgFile != "" {
+		// Use config file from the flag
+		viper.SetConfigFile(cfgFile)
+	} else {
+		// Find home directory
+		home, err := os.UserHomeDir()
+		cobra.CheckErr(err)
+
+		// Search config in home directory with name ".totalrecall" (without extension)
+		viper.AddConfigPath(home)
+		viper.AddConfigPath(".")
+		viper.SetConfigType("yaml")
+		viper.SetConfigName(".totalrecall")
+	}
+
+	// Environment variables
+	viper.SetEnvPrefix("TOTALRECALL")
+	viper.AutomaticEnv()
+
+	// Read config file
+	if err := viper.ReadInConfig(); err != nil {
+		if _, notFound := err.(viper.ConfigFileNotFoundError); !notFound {
+			fmt.Fprintf(os.Stderr, "Warning: could not read config file: %v\n", err)
+		}
+		return
+	}
+	fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed())
+}
+
+// runCommand is the RunE handler of rootCmd.
+//
+// It collects the words to process (from the --batch file when given,
+// otherwise from the first positional argument), validates each of them with
+// audio.ValidateBulgarianText, creates the output directory and processes the
+// words one by one. A failure for a single word is printed to stderr and
+// does not stop the run. With --anki an import CSV is generated at the end.
+//
+// It returns an error when the batch file cannot be read, no input was
+// given, a word fails validation or the output directory cannot be created.
+func runCommand(cmd *cobra.Command, args []string) error {
+	var words []string
+
+	switch {
+	case batchFile != "":
+		// One word per line; blank lines are dropped.
+		content, err := os.ReadFile(batchFile)
+		if err != nil {
+			return fmt.Errorf("failed to read batch file: %w", err)
+		}
+		for _, raw := range splitLines(string(content)) {
+			if entry := trimSpace(raw); entry != "" {
+				words = append(words, entry)
+			}
+		}
+	case len(args) > 0:
+		words = []string{args[0]}
+	default:
+		return fmt.Errorf("please provide a Bulgarian word or use --batch flag")
+	}
+
+	// Reject the whole run if any word is invalid.
+	for _, candidate := range words {
+		err := audio.ValidateBulgarianText(candidate)
+		if err != nil {
+			return fmt.Errorf("invalid word '%s': %w", candidate, err)
+		}
+	}
+
+	if err := os.MkdirAll(outputDir, 0755); err != nil {
+		return fmt.Errorf("failed to create output directory: %w", err)
+	}
+
+	total := len(words)
+	for idx, current := range words {
+		fmt.Printf("\nProcessing %d/%d: %s\n", idx+1, total, current)
+
+		// Errors are reported but the remaining words are still processed.
+		if err := processWord(current); err != nil {
+			fmt.Fprintf(os.Stderr, "Error processing '%s': %v\n", current, err)
+		}
+	}
+
+	if generateAnki {
+		fmt.Printf("\nGenerating Anki import file...\n")
+		err := generateAnkiCSV()
+		if err == nil {
+			fmt.Println("Anki import file created: anki_import.csv")
+		} else {
+			fmt.Fprintf(os.Stderr, "Warning: Failed to generate Anki CSV: %v\n", err)
+		}
+	}
+
+	fmt.Println("\nDone! Materials saved to:", outputDir)
+	return nil
+}
+
+// processWord produces the materials for a single word: first audio (unless
+// --skip-audio), then images (unless --skip-images). The first failing step
+// aborts processing of the word and its error is returned, wrapped with the
+// name of the step.
+func processWord(word string) error {
+	steps := []struct {
+		skip    bool
+		banner  string
+		failure string
+		run     func(string) error
+	}{
+		{skipAudio, " Generating audio...\n", "audio generation failed", generateAudio},
+		{skipImages, " Downloading images...\n", "image download failed", downloadImages},
+	}
+
+	for _, step := range steps {
+		if step.skip {
+			continue
+		}
+		fmt.Print(step.banner)
+		if err := step.run(word); err != nil {
+			return fmt.Errorf("%s: %w", step.failure, err)
+		}
+	}
+
+	return nil
+}
+
+// generateAudio creates the pronunciation audio for word and writes it to
+// <outputDir>/<sanitized word>.<audioFormat>.
+//
+// The provider configuration starts from the flag values. Where a flag still
+// holds its default and the corresponding viper key is set, the viper value
+// is used instead. ESpeakSpeed falls back to 150 and the cache directory to
+// "./.audio_cache" when not configured.
+//
+// If the OpenAI provider cannot be created, espeak-ng is tried as a
+// fallback. When the fallback fails too, the returned error wraps the
+// espeak-ng error and also carries the original OpenAI error text, so
+// neither cause is lost.
+func generateAudio(word string) error {
+	// Create audio provider configuration
+	providerConfig := &audio.Config{
+		Provider:     audioProvider,
+		OutputDir:    outputDir,
+		OutputFormat: audioFormat,
+
+		// ESpeak settings
+		ESpeakVoice:     voice,
+		ESpeakSpeed:     viper.GetInt("audio.speed"),
+		ESpeakPitch:     audioPitch,
+		ESpeakAmplitude: audioAmplitude,
+		ESpeakWordGap:   audioWordGap,
+
+		// OpenAI settings
+		OpenAIKey:   getOpenAIKey(),
+		OpenAIModel: openAIModel,
+		OpenAIVoice: openAIVoice,
+		OpenAISpeed: openAISpeed,
+
+		// Caching
+		EnableCache: viper.GetBool("audio.enable_cache"),
+		CacheDir:    viper.GetString("audio.cache_dir"),
+	}
+
+	// Set defaults
+	if providerConfig.ESpeakSpeed == 0 {
+		providerConfig.ESpeakSpeed = 150
+	}
+	if providerConfig.CacheDir == "" {
+		providerConfig.CacheDir = "./.audio_cache"
+	}
+
+	// Use config file values if not overridden by flags
+	if audioProvider == "openai" && viper.IsSet("audio.provider") {
+		providerConfig.Provider = viper.GetString("audio.provider")
+	}
+	if audioPitch == 50 && viper.IsSet("audio.pitch") {
+		providerConfig.ESpeakPitch = viper.GetInt("audio.pitch")
+	}
+	if audioAmplitude == 100 && viper.IsSet("audio.amplitude") {
+		providerConfig.ESpeakAmplitude = viper.GetInt("audio.amplitude")
+	}
+	if audioWordGap == 0 && viper.IsSet("audio.word_gap") {
+		providerConfig.ESpeakWordGap = viper.GetInt("audio.word_gap")
+	}
+	if openAIModel == "tts-1" && viper.IsSet("audio.openai_model") {
+		providerConfig.OpenAIModel = viper.GetString("audio.openai_model")
+	}
+	if openAIVoice == "nova" && viper.IsSet("audio.openai_voice") {
+		providerConfig.OpenAIVoice = viper.GetString("audio.openai_voice")
+	}
+	if openAISpeed == 1.0 && viper.IsSet("audio.openai_speed") {
+		providerConfig.OpenAISpeed = viper.GetFloat64("audio.openai_speed")
+	}
+
+	// Create the audio provider
+	provider, err := audio.NewProvider(providerConfig)
+	if err != nil {
+		// Only an OpenAI failure has a fallback; other providers fail directly.
+		if providerConfig.Provider != "openai" {
+			return err
+		}
+		fmt.Printf("Warning: OpenAI audio provider failed (%v), falling back to espeak-ng\n", err)
+		providerConfig.Provider = "espeak"
+		fallbackProvider, fallbackErr := audio.NewProvider(providerConfig)
+		if fallbackErr != nil {
+			return fmt.Errorf("both OpenAI (%v) and espeak-ng failed: %w", err, fallbackErr)
+		}
+		provider = fallbackProvider
+	}
+
+	// Generate audio file
+	filename := sanitizeFilename(word)
+	outputFile := filepath.Join(outputDir, fmt.Sprintf("%s.%s", filename, audioFormat))
+
+	ctx := context.Background()
+	return provider.GenerateAudio(ctx, word, outputFile)
+}
+
+// downloadImages fetches (or, for OpenAI, generates) the image(s) for word
+// and stores them in outputDir using the file name pattern "{word}_{index}".
+//
+// Provider selection: the --image-api flag value is used, except that when
+// the flag still holds its default ("openai") and the viper key
+// image.provider is set, the viper value wins. This mirrors how
+// generateAudio resolves audio.provider, so an `image.provider` entry in the
+// config file is honoured.
+//
+// For the OpenAI provider the API key is checked before a client is built.
+// Without a key the function warns once, switches the package-level imageAPI
+// to "pixabay" (so later words use Pixabay directly) and continues with a
+// key-less Pixabay client.
+//
+// It returns an error for an unknown provider, a missing Unsplash key, a
+// failing Unsplash client constructor or a failing download.
+func downloadImages(word string) error {
+	provider := imageAPI
+	if imageAPI == "openai" && viper.IsSet("image.provider") {
+		provider = viper.GetString("image.provider")
+	}
+
+	// Create image searcher based on provider
+	var searcher image.ImageSearcher
+	var err error
+
+	switch provider {
+	case "pixabay":
+		apiKey := viper.GetString("image.pixabay_key")
+		searcher = image.NewPixabayClient(apiKey)
+
+	case "unsplash":
+		apiKey := viper.GetString("image.unsplash_key")
+		if apiKey == "" {
+			return fmt.Errorf("Unsplash API key is required in config")
+		}
+		searcher, err = image.NewUnsplashClient(apiKey)
+		if err != nil {
+			return err
+		}
+
+	case "openai":
+		apiKey := getOpenAIKey()
+		if apiKey == "" {
+			fmt.Printf("Warning: OpenAI API key not found, falling back to Pixabay for images\n")
+			imageAPI = "pixabay"
+			searcher = image.NewPixabayClient("")
+			break
+		}
+
+		// Create OpenAI image configuration
+		openaiConfig := &image.OpenAIConfig{
+			APIKey:      apiKey,
+			Model:       openAIImageModel,
+			Size:        openAIImageSize,
+			Quality:     openAIImageQuality,
+			Style:       openAIImageStyle,
+			CacheDir:    viper.GetString("image.cache_dir"),
+			EnableCache: viper.GetBool("image.enable_cache"),
+		}
+
+		// Use config file values if not overridden by flags
+		if openAIImageModel == "dall-e-2" && viper.IsSet("image.openai_model") {
+			openaiConfig.Model = viper.GetString("image.openai_model")
+		}
+		if openAIImageSize == "512x512" && viper.IsSet("image.openai_size") {
+			openaiConfig.Size = viper.GetString("image.openai_size")
+		}
+		if openAIImageQuality == "standard" && viper.IsSet("image.openai_quality") {
+			openaiConfig.Quality = viper.GetString("image.openai_quality")
+		}
+		if openAIImageStyle == "natural" && viper.IsSet("image.openai_style") {
+			openaiConfig.Style = viper.GetString("image.openai_style")
+		}
+
+		// Set defaults: caching is on unless explicitly configured otherwise.
+		if openaiConfig.CacheDir == "" {
+			openaiConfig.CacheDir = "./.image_cache"
+		}
+		if !viper.IsSet("image.enable_cache") {
+			openaiConfig.EnableCache = true
+		}
+
+		searcher = image.NewOpenAIClient(openaiConfig)
+
+	default:
+		return fmt.Errorf("unknown image provider: %s", provider)
+	}
+
+	// Create downloader
+	downloadOpts := &image.DownloadOptions{
+		OutputDir:         outputDir,
+		OverwriteExisting: false,
+		CreateDir:         true,
+		FileNamePattern:   "{word}_{index}",
+		MaxSizeBytes:      5 * 1024 * 1024, // 5MB
+	}
+
+	downloader := image.NewDownloader(searcher, downloadOpts)
+
+	// Download images
+	ctx := context.Background()
+	if imagesPerWord == 1 {
+		_, path, err := downloader.DownloadBestMatch(ctx, word)
+		if err != nil {
+			return err
+		}
+		fmt.Printf(" Downloaded: %s\n", path)
+		return nil
+	}
+
+	paths, err := downloader.DownloadMultiple(ctx, word, imagesPerWord)
+	if err != nil {
+		return err
+	}
+	for _, path := range paths {
+		fmt.Printf(" Downloaded: %s\n", path)
+	}
+
+	return nil
+}
+
+func sanitizeFilename(s string) string {
+ // Simple filename sanitization
+ result := ""
+ for _, r := range s {
+ if isAlphaNumeric(r) || r == '-' || r == '_' {
+ result += string(r)
+ } else {
+ result += "_"
+ }
+ }
+ return result
+}
+
+// isAlphaNumeric reports whether r may appear unchanged in a generated file
+// name: ASCII letters and digits, the Cyrillic ranges а–я and А–Я, and the
+// Bulgarian letter ѝ/Ѝ.
+//
+// ѝ (U+045D) and Ѝ (U+040D) lie outside the contiguous а–я / А–Я ranges, so
+// they are listed explicitly; otherwise a word containing ѝ would have that
+// letter replaced by '_' in sanitizeFilename.
+func isAlphaNumeric(r rune) bool {
+	switch {
+	case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
+		return true
+	case r >= 'а' && r <= 'я', r >= 'А' && r <= 'Я':
+		return true
+	case r == 'ѝ', r == 'Ѝ':
+		return true
+	}
+	return false
+}
+
+func splitLines(s string) []string {
+ // Simple line splitter
+ var lines []string
+ current := ""
+ for _, r := range s {
+ if r == '\n' {
+ lines = append(lines, current)
+ current = ""
+ } else if r != '\r' {
+ current += string(r)
+ }
+ }
+ if current != "" {
+ lines = append(lines, current)
+ }
+ return lines
+}
+
+// trimSpace returns s without leading and trailing whitespace, where
+// whitespace is exactly what isSpace accepts (space, tab, newline and
+// carriage return). Interior whitespace is left untouched.
+//
+// strings.TrimFunc replaces the hand-written index loops while keeping the
+// same set of trimmed characters.
+func trimSpace(s string) string {
+	return strings.TrimFunc(s, isSpace)
+}
+
+// isSpace reports whether r is one of the four ASCII whitespace characters
+// treated as trimmable: space, tab, line feed or carriage return.
+func isSpace(r rune) bool {
+	switch r {
+	case ' ', '\t', '\n', '\r':
+		return true
+	default:
+		return false
+	}
+}
+
+// generateAnkiCSV builds the Anki import file "anki_import.csv" inside
+// outputDir from the media found in that directory, then prints how many
+// cards were generated and how many of them have audio and images.
+// It returns a wrapped error when collecting the cards or writing the CSV
+// fails.
+func generateAnkiCSV() error {
+	opts := &anki.GeneratorOptions{
+		OutputPath:     filepath.Join(outputDir, "anki_import.csv"),
+		MediaFolder:    outputDir,
+		IncludeHeaders: true,
+		AudioFormat:    audioFormat,
+	}
+	gen := anki.NewGenerator(opts)
+
+	// Collect cards from the output directory, then write them out.
+	err := gen.GenerateFromDirectory(outputDir)
+	if err != nil {
+		return fmt.Errorf("failed to generate cards: %w", err)
+	}
+
+	err = gen.GenerateCSV()
+	if err != nil {
+		return fmt.Errorf("failed to generate CSV: %w", err)
+	}
+
+	cards, audioCards, imageCards := gen.Stats()
+	fmt.Printf(" Generated %d cards (%d with audio, %d with images)\n",
+		cards, audioCards, imageCards)
+
+	return nil
+}
+
+// getOpenAIKey returns the OpenAI API key. A non-empty OPENAI_API_KEY
+// environment variable takes precedence; otherwise the viper value
+// "audio.openai_key" is returned (empty when not configured).
+func getOpenAIKey() string {
+	fromEnv := os.Getenv("OPENAI_API_KEY")
+	if fromEnv == "" {
+		return viper.GetString("audio.openai_key")
+	}
+	return fromEnv
+}
+
+// main runs the root command and exits with status 1 when it reports an
+// error.
+func main() {
+	err := rootCmd.Execute()
+	if err == nil {
+		return
+	}
+	os.Exit(1)
+}
\ No newline at end of file
diff --git a/internal/audio/provider.go b/internal/audio/provider.go
index 5b8c336..c803b61 100644
--- a/internal/audio/provider.go
+++ b/internal/audio/provider.go
@@ -44,7 +44,7 @@ type Config struct {
// DefaultConfig returns default configuration
func DefaultProviderConfig() *Config {
return &Config{
- Provider: "espeak",
+ Provider: "openai",
OutputDir: "./",
OutputFormat: "mp3",
ESpeakVoice: "bg",
diff --git a/internal/image/openai.go b/internal/image/openai.go
new file mode 100644
index 0000000..a5a3e31
--- /dev/null
+++ b/internal/image/openai.go
@@ -0,0 +1,329 @@
+package image
+
+import (
+ "context"
+ "crypto/md5"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "net/http"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "github.com/sashabaranov/go-openai"
+)
+
+// OpenAIClient implements ImageSearcher for OpenAI DALL-E image generation.
+// A value whose client field is nil (as returned by NewOpenAIClient when no
+// API key is given) makes Search fail with a NO_API_KEY SearchError.
+type OpenAIClient struct {
+ // client is the go-openai API client; nil when no API key was configured.
+ client *openai.Client
+ // apiKey is the key the client was created with.
+ apiKey string
+ model string // dall-e-2 or dall-e-3
+ size string // 256x256, 512x512, 1024x1024
+ quality string // standard or hd (dall-e-3 only)
+ style string // natural or vivid (dall-e-3 only)
+ // cacheDir is the root directory for cached images.
+ cacheDir string
+ // enableCache turns cache lookups and cache writes in Search on or off.
+ enableCache bool
+}
+
+// OpenAIConfig holds configuration for the OpenAI image provider.
+// Empty Model, Size, Quality, Style and CacheDir values are replaced by
+// defaults in NewOpenAIClient ("dall-e-2", "512x512", "standard", "natural"
+// and "./.image_cache").
+type OpenAIConfig struct {
+ // APIKey is the OpenAI API key; without it no usable client is created.
+ APIKey string
+ Model string
+ Size string
+ Quality string
+ Style string
+ CacheDir string
+ // EnableCache has no default applied by NewOpenAIClient; the zero value
+ // disables caching.
+ EnableCache bool
+}
+
+// NewOpenAIClient creates a new OpenAI DALL-E client from config.
+//
+// When config is nil or carries no API key, a zero-value *OpenAIClient is
+// returned (never nil); its Search method fails with a NO_API_KEY error.
+//
+// Empty Model, Size, Quality, Style and CacheDir fields default to
+// "dall-e-2", "512x512", "standard", "natural" and "./.image_cache". The
+// defaults are applied to a private copy, so the caller's config is left
+// unmodified.
+//
+// With caching enabled the cache directory is created on a best-effort
+// basis; a failure is ignored here because downloadAndCache creates the
+// directory it needs again before writing.
+func NewOpenAIClient(config *OpenAIConfig) *OpenAIClient {
+	if config == nil || config.APIKey == "" {
+		// Client without an API connection; operations will report NO_API_KEY.
+		return &OpenAIClient{}
+	}
+
+	// Work on a copy so defaults do not leak back into the caller's struct.
+	cfg := *config
+	if cfg.Model == "" {
+		cfg.Model = "dall-e-2"
+	}
+	if cfg.Size == "" {
+		cfg.Size = "512x512"
+	}
+	if cfg.Quality == "" {
+		cfg.Quality = "standard"
+	}
+	if cfg.Style == "" {
+		cfg.Style = "natural"
+	}
+	if cfg.CacheDir == "" {
+		cfg.CacheDir = "./.image_cache"
+	}
+
+	oc := &OpenAIClient{
+		client:      openai.NewClient(cfg.APIKey),
+		apiKey:      cfg.APIKey,
+		model:       cfg.Model,
+		size:        cfg.Size,
+		quality:     cfg.Quality,
+		style:       cfg.Style,
+		cacheDir:    cfg.CacheDir,
+		enableCache: cfg.EnableCache,
+	}
+
+	// Create cache directory if caching is enabled (best effort, see above).
+	if oc.enableCache && oc.cacheDir != "" {
+		_ = os.MkdirAll(oc.cacheDir, 0755)
+	}
+
+	return oc
+}
+
+// Search generates an image for the Bulgarian word using DALL-E
+func (c *OpenAIClient) Search(ctx context.Context, opts *SearchOptions) ([]SearchResult, error) {
+ if c.client == nil {
+ return nil, &SearchError{
+ Provider: "openai",
+ Code: "NO_API_KEY",
+ Message: "OpenAI API key not configured",
+ }
+ }
+
+ // Check cache first
+ if c.enableCache {
+ cacheFile := c.getCacheFilePath(opts.Query)
+ if info, err := os.Stat(cacheFile); err == nil && info.Size() > 0 {
+ // Return cached result
+ result := SearchResult{
+ ID: c.generateImageID(opts.Query),
+ URL: cacheFile,
+ ThumbnailURL: cacheFile,
+ Width: c.getSizeWidth(),
+ Height: c.getSizeHeight(),
+ Description: fmt.Sprintf("Generated image for %s", opts.Query),
+ Attribution: "Generated by OpenAI DALL-E",
+ Source: "openai",
+ }
+ return []SearchResult{result}, nil
+ }
+ }
+
+ // Translate Bulgarian word to English for better results
+ translatedWord := translateBulgarianToEnglish(opts.Query)
+
+ // Create educational prompt
+ prompt := c.createEducationalPrompt(opts.Query, translatedWord)
+
+ // Create the image generation request
+ req := openai.ImageRequest{
+ Prompt: prompt,
+ Model: c.model,
+ Size: c.size,
+ ResponseFormat: openai.CreateImageResponseFormatURL,
+ N: 1,
+ }
+
+ // Add model-specific parameters
+ if c.model == "dall-e-3" {
+ req.Quality = c.quality
+ req.Style = c.style
+ }
+
+ // Generate the image
+ resp, err := c.client.CreateImage(ctx, req)
+ if err != nil {
+ return nil, &SearchError{
+ Provider: "openai",
+ Code: "API_ERROR",
+ Message: fmt.Sprintf("Failed to generate image: %v", err),
+ }
+ }
+
+ if len(resp.Data) == 0 {
+ return nil, &SearchError{
+ Provider: "openai",
+ Code: "NO_RESULTS",
+ Message: "No image generated",
+ }
+ }
+
+ // Get the generated image URL
+ imageURL := resp.Data[0].URL
+
+ // Download and cache the image if caching is enabled
+ if c.enableCache {
+ cacheFile := c.getCacheFilePath(opts.Query)
+ if err := c.downloadAndCache(ctx, imageURL, cacheFile); err == nil {
+ // Update URL to point to cached file
+ imageURL = cacheFile
+ }
+ // Continue even if caching fails
+ }
+
+ // Create result
+ result := SearchResult{
+ ID: c.generateImageID(opts.Query),
+ URL: imageURL,
+ ThumbnailURL: imageURL,
+ Width: c.getSizeWidth(),
+ Height: c.getSizeHeight(),
+ Description: fmt.Sprintf("Generated educational image for %s (%s)", opts.Query, translatedWord),
+ Attribution: "Generated by OpenAI DALL-E",
+ Source: "openai",
+ }
+
+ return []SearchResult{result}, nil
+}
+
+// Download returns a reader for the image at url. A url without an
+// "http://" or "https://" prefix is treated as a local (cached) file path
+// and opened directly; anything else is fetched with an HTTP GET bound to
+// ctx. A response status other than 200 OK is returned as an error. The
+// caller must close the returned reader.
+func (c *OpenAIClient) Download(ctx context.Context, url string) (io.ReadCloser, error) {
+	remote := strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")
+	if !remote {
+		cached, openErr := os.Open(url)
+		if openErr != nil {
+			return nil, fmt.Errorf("failed to open cached file: %w", openErr)
+		}
+		return cached, nil
+	}
+
+	request, err := http.NewRequestWithContext(ctx, "GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	response, err := http.DefaultClient.Do(request)
+	if err != nil {
+		return nil, err
+	}
+
+	if response.StatusCode == http.StatusOK {
+		return response.Body, nil
+	}
+
+	// Release the connection before reporting the unexpected status.
+	response.Body.Close()
+	return nil, fmt.Errorf("HTTP %d: %s", response.StatusCode, response.Status)
+}
+
+// GetAttribution returns the required attribution text
+func (c *OpenAIClient) GetAttribution(result *SearchResult) string {
+ return "Image generated by OpenAI DALL-E"
+}
+
+// Name returns the name of the provider
+func (c *OpenAIClient) Name() string {
+ return "openai"
+}
+
+// createEducationalPrompt generates a prompt optimized for language learning
+func (c *OpenAIClient) createEducationalPrompt(bulgarianWord, englishTranslation string) string {
+ // Create a prompt that generates clear, educational images
+ // suitable for language learning flashcards
+ return fmt.Sprintf(
+ "A simple, clear, photorealistic educational image showing %s, "+
+ "suitable for language learning flashcards. "+
+ "The image should be easily recognizable, with good lighting, "+
+ "plain background, and focused on a single clear subject. "+
+ "No text, labels, or writing in the image.",
+ englishTranslation,
+ )
+}
+
+// translateBulgarianToEnglish returns the English rendering of word by
+// delegating to translateBulgarianQuery, the package's existing translation
+// helper.
+func translateBulgarianToEnglish(word string) string {
+	english := translateBulgarianQuery(word)
+	return english
+}
+
+// getCacheFilePath returns the cache location for word under the current
+// settings. The path is derived from the MD5 digest of word, model, size,
+// quality and style concatenated in that order: the first two hex characters
+// name a subdirectory of cacheDir and the remaining characters, plus ".png",
+// form the file name.
+func (c *OpenAIClient) getCacheFilePath(word string) string {
+	sum := md5.Sum([]byte(word + c.model + c.size + c.quality + c.style))
+	digest := hex.EncodeToString(sum[:])
+
+	// Two-character fan-out keeps individual directories small.
+	return filepath.Join(c.cacheDir, digest[:2], digest[2:]+".png")
+}
+
+// downloadAndCache fetches url through Download and stores the data in
+// cacheFile, creating the parent directory when needed.
+//
+// If copying or closing the file fails, the partially written cache file is
+// removed again. Search treats any non-empty cache file as a valid hit, so
+// a truncated file left behind would otherwise be served as the image on the
+// next run. The error from closing the file is reported as well, since a
+// failed close can mean the data was not fully written.
+func (c *OpenAIClient) downloadAndCache(ctx context.Context, url, cacheFile string) (err error) {
+	// Ensure directory exists
+	if err = os.MkdirAll(filepath.Dir(cacheFile), 0755); err != nil {
+		return err
+	}
+
+	// Download the image
+	body, err := c.Download(ctx, url)
+	if err != nil {
+		return err
+	}
+	defer body.Close()
+
+	// Create the cache file
+	out, err := os.Create(cacheFile)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if closeErr := out.Close(); err == nil {
+			err = closeErr
+		}
+		if err != nil {
+			// Never leave a partial file that would pass as a cache hit.
+			os.Remove(cacheFile)
+		}
+	}()
+
+	// Copy the data
+	_, err = io.Copy(out, body)
+	return err
+}
+
+// generateImageID derives the result ID for word: the prefix "openai_"
+// followed by the first eight hex characters of the MD5 digest of word and
+// model concatenated.
+func (c *OpenAIClient) generateImageID(word string) string {
+	sum := md5.Sum([]byte(word + c.model))
+	digest := hex.EncodeToString(sum[:])
+	return "openai_" + digest[:8]
+}
+
+// getSizeWidth returns the pixel width implied by the size setting
+// ("<width>x<height>"). Unrecognised settings yield 512.
+func (c *OpenAIClient) getSizeWidth() int {
+	switch c.size {
+	case "256x256":
+		return 256
+	case "1024x1024", "1024x1792":
+		return 1024
+	case "1792x1024":
+		return 1792
+	}
+	// Covers "512x512" as well as any unknown value.
+	return 512
+}
+
+// getSizeHeight returns the pixel height implied by the size setting
+// ("<width>x<height>"). Unrecognised settings yield 512.
+func (c *OpenAIClient) getSizeHeight() int {
+	switch c.size {
+	case "256x256":
+		return 256
+	case "1024x1024", "1792x1024":
+		return 1024
+	case "1024x1792":
+		return 1792
+	}
+	// Covers "512x512" as well as any unknown value.
+	return 512
+}
\ No newline at end of file
diff --git a/internal/image/openai_test.go b/internal/image/openai_test.go
new file mode 100644
index 0000000..8f42aeb
--- /dev/null
+++ b/internal/image/openai_test.go
@@ -0,0 +1,280 @@
+package image
+
+import (
+ "context"
+ "os"
+ "testing"
+)
+
+func TestOpenAIClient_NewClient(t *testing.T) {
+ tests := []struct {
+ name string
+ config *OpenAIConfig
+ wantNil bool
+ }{
+ {
+ name: "with API key",
+ config: &OpenAIConfig{
+ APIKey: "test-key",
+ Model: "dall-e-2",
+ Size: "512x512",
+ },
+ wantNil: false,
+ },
+ {
+ name: "without API key",
+ config: &OpenAIConfig{
+ APIKey: "",
+ },
+ wantNil: false, // Client is created but will fail on operations
+ },
+ {
+ name: "with defaults",
+ config: &OpenAIConfig{
+ APIKey: "test-key",
+ },
+ wantNil: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ client := NewOpenAIClient(tt.config)
+ if (client == nil) != tt.wantNil {
+ t.Errorf("NewOpenAIClient() returned nil = %v, want %v", client == nil, tt.wantNil)
+ }
+
+ if client != nil && tt.config.APIKey != "" {
+ // Check defaults were set
+ if tt.config.Model == "" && client.model != "dall-e-2" {
+ t.Errorf("Expected default model dall-e-2, got %s", client.model)
+ }
+ if tt.config.Size == "" && client.size != "512x512" {
+ t.Errorf("Expected default size 512x512, got %s", client.size)
+ }
+ }
+ })
+ }
+}
+
+func TestOpenAIClient_createEducationalPrompt(t *testing.T) {
+ client := &OpenAIClient{}
+
+ tests := []struct {
+ bulgarian string
+ english string
+ wantContains []string
+ }{
+ {
+ bulgarian: "ябълка",
+ english: "apple",
+ wantContains: []string{"apple", "educational", "flashcard"},
+ },
+ {
+ bulgarian: "котка",
+ english: "cat",
+ wantContains: []string{"cat", "simple", "clear"},
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.bulgarian, func(t *testing.T) {
+ prompt := client.createEducationalPrompt(tt.bulgarian, tt.english)
+
+ for _, want := range tt.wantContains {
+ if !contains(prompt, want) {
+ t.Errorf("Prompt missing expected word '%s': %s", want, prompt)
+ }
+ }
+ })
+ }
+}
+
+// TestOpenAIClient_getCacheFilePath checks that cache paths are stable for
+// the same word, differ between words, live under the configured cache
+// directory, and carry a .png extension.
+func TestOpenAIClient_getCacheFilePath(t *testing.T) {
+	client := &OpenAIClient{
+		model:    "dall-e-2",
+		size:     "512x512",
+		quality:  "standard",
+		style:    "natural",
+		cacheDir: "./.test_cache",
+	}
+
+	// Same input must map to the same path on every call.
+	first := client.getCacheFilePath("ябълка")
+	second := client.getCacheFilePath("ябълка")
+	if first != second {
+		t.Errorf("Cache paths differ for same input: %s vs %s", first, second)
+	}
+
+	// A different word must map to a different path.
+	if other := client.getCacheFilePath("котка"); first == other {
+		t.Errorf("Cache paths same for different inputs")
+	}
+
+	// Path structure: cache directory and file extension.
+	if !contains(first, ".test_cache") {
+		t.Errorf("Cache path doesn't contain cache dir: %s", first)
+	}
+	if !contains(first, ".png") {
+		t.Errorf("Cache path doesn't have .png extension: %s", first)
+	}
+}
+
+func TestOpenAIClient_translateBulgarianToEnglish(t *testing.T) {
+ tests := []struct {
+ input string
+ expected string
+ }{
+ {"ябълка", "apple"},
+ {"котка", "cat"},
+ {"куче", "dog"},
+ {"хляб", "bread"},
+ {"unknown", "unknown"}, // Should return original if not in dictionary
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.input, func(t *testing.T) {
+ result := translateBulgarianToEnglish(tt.input)
+ if result != tt.expected {
+ t.Errorf("translateBulgarianToEnglish(%s) = %s, want %s",
+ tt.input, result, tt.expected)
+ }
+ })
+ }
+}
+
+func TestOpenAIClient_getSizeWidthHeight(t *testing.T) {
+ tests := []struct {
+ size string
+ width int
+ height int
+ }{
+ {"256x256", 256, 256},
+ {"512x512", 512, 512},
+ {"1024x1024", 1024, 1024},
+ {"1024x1792", 1024, 1792},
+ {"1792x1024", 1792, 1024},
+ {"unknown", 512, 512}, // Default
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.size, func(t *testing.T) {
+ client := &OpenAIClient{size: tt.size}
+
+ if w := client.getSizeWidth(); w != tt.width {
+ t.Errorf("getSizeWidth() = %d, want %d", w, tt.width)
+ }
+
+ if h := client.getSizeHeight(); h != tt.height {
+ t.Errorf("getSizeHeight() = %d, want %d", h, tt.height)
+ }
+ })
+ }
+}
+
+// TestOpenAIClient_Search_NoAPIKey checks that Search on a client built from
+// an empty config fails with a *SearchError whose Code is "NO_API_KEY".
+func TestOpenAIClient_Search_NoAPIKey(t *testing.T) {
+	client := NewOpenAIClient(&OpenAIConfig{})
+
+	_, err := client.Search(context.Background(), DefaultSearchOptions("ябълка"))
+	if err == nil {
+		t.Error("Expected error for missing API key")
+	}
+
+	searchErr, isSearchErr := err.(*SearchError)
+	if !isSearchErr {
+		t.Error("Expected SearchError type")
+		return
+	}
+	if searchErr.Code != "NO_API_KEY" {
+		t.Errorf("Expected NO_API_KEY error, got %s", searchErr.Code)
+	}
+}
+
+// TestOpenAIClient_Name checks that the client reports "openai" as its name.
+func TestOpenAIClient_Name(t *testing.T) {
+	client := &OpenAIClient{}
+	got := client.Name()
+	if got == "openai" {
+		return
+	}
+	t.Errorf("Name() = %s, want 'openai'", got)
+}
+
+// TestOpenAIClient_GetAttribution checks that the attribution text for a
+// result mentions "OpenAI DALL-E".
+func TestOpenAIClient_GetAttribution(t *testing.T) {
+	client := &OpenAIClient{}
+
+	attribution := client.GetAttribution(&SearchResult{})
+	if contains(attribution, "OpenAI DALL-E") {
+		return
+	}
+	t.Errorf("Attribution doesn't mention OpenAI DALL-E: %s", attribution)
+}
+
+// contains reports whether substr occurs within s. It is a small test helper
+// that delegates the actual scan to containsHelper.
+func contains(s, substr string) bool {
+	// A needle longer than the haystack can never match.
+	if len(s) < len(substr) {
+		return false
+	}
+	if s == substr {
+		return true
+	}
+	return len(s) > 0 && containsHelper(s, substr)
+}
+
+// containsHelper scans s from left to right and reports whether any window
+// of len(substr) bytes equals substr.
+func containsHelper(s, substr string) bool {
+	width := len(substr)
+	for start := 0; start+width <= len(s); start++ {
+		if s[start:start+width] == substr {
+			return true
+		}
+	}
+	return false
+}
+
+// TestOpenAIClient_Search_Integration exercises Search against the real
+// OpenAI API. It is skipped unless OPENAI_API_KEY is set in the environment.
+//
+// The test requests one image at the smallest size, validates the fields of
+// the returned result, and then repeats the search with caching enabled. A
+// differing URL on the second search is only logged, not treated as a failure.
+func TestOpenAIClient_Search_Integration(t *testing.T) {
+	apiKey := os.Getenv("OPENAI_API_KEY")
+	if apiKey == "" {
+		t.Skip("OPENAI_API_KEY not set, skipping integration test")
+	}
+
+	client := NewOpenAIClient(&OpenAIConfig{
+		APIKey:      apiKey,
+		Model:       "dall-e-2",
+		Size:        "256x256", // Smallest size to minimize cost
+		EnableCache: true,
+		CacheDir:    t.TempDir(),
+	})
+
+	opts := DefaultSearchOptions("ябълка")
+	results, err := client.Search(context.Background(), opts)
+
+	if err != nil {
+		t.Fatalf("Search failed: %v", err)
+	}
+
+	if len(results) != 1 {
+		t.Fatalf("Expected 1 result, got %d", len(results))
+	}
+
+	result := results[0]
+
+	// Check result fields
+	if result.ID == "" {
+		t.Error("Result ID is empty")
+	}
+	if result.URL == "" {
+		t.Error("Result URL is empty")
+	}
+	if result.Width != 256 || result.Height != 256 {
+		t.Errorf("Expected 256x256, got %dx%d", result.Width, result.Height)
+	}
+	if result.Source != "openai" {
+		t.Errorf("Expected source 'openai', got '%s'", result.Source)
+	}
+
+	// Test caching - second request should use cache
+	results2, err := client.Search(context.Background(), opts)
+	if err != nil {
+		t.Fatalf("Second search failed: %v", err)
+	}
+
+	// Guard the index below: an empty second result set must fail the test
+	// rather than panic with an out-of-range access.
+	if len(results2) != 1 {
+		t.Fatalf("Expected 1 result from second search, got %d", len(results2))
+	}
+
+	if results2[0].URL != results[0].URL {
+		t.Log("Note: URLs differ, cache might not be working as expected")
+	}
+} \ No newline at end of file
diff --git a/internal/version.go b/internal/version.go
index 93a42a8..0894830 100644
--- a/internal/version.go
+++ b/internal/version.go
@@ -1,3 +1,3 @@
package internal
-const Version = "0.0.0"
+const Version = "0.1.0"