#!/usr/bin/env node

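/**
 * ESP32 Audio UDP Receiver with Google Gemini API
 * Uses the Gemini 2.0 Flash model (instead of OpenAI) to answer recorded speech.
 * Replies are spoken via OpenAI TTS when OPENAI_API_KEY is set (converted with
 * ffmpeg and streamed back to the ESP32), otherwise via the macOS `say` command.
 *
 * Setup:
 * 1. Get API key from https://aistudio.google.com/apikey
 * 2. export GOOGLE_API_KEY='your-key-here'
 * 3. npm install @google/generative-ai
 * 4. (optional) export OPENAI_API_KEY='your-key-here' and install ffmpeg
 * 5. node server-gemini.js
 */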

const dgram = require("dgram");
const { spawn } = require("child_process");

// ---------------------------------------------------------------------------
// Audio format of the PCM stream. These values are used when estimating the
// duration of buffered audio, when building the WAV header, and when asking
// ffmpeg to resample TTS output.
// ---------------------------------------------------------------------------
const SAMPLE_RATE = 16000; // Hz
const CHANNELS = 1; // mono (only printed in the startup banner in this file)
const BITS_PER_SAMPLE = 16;
// UDP port this server binds to for incoming audio packets
const UDP_PORT = 8000;
// UDP port on ESP32 where it listens for downlink speaker packets
const DOWNLINK_PORT = 9999;
// chunk samples used by ESP32 (must match sketch)
const CHUNK_SAMPLES = 256;
// How long (ms) of silence before we consider the user finished speaking
const SILENCE_TIMEOUT_MS = 800;
// Minimum time (ms) between two statistics log lines
const STATS_INTERVAL_MS = 5000;
// Period (ms) of the timer that checks for end-of-speech silence
const SILENCE_CHECK_INTERVAL_MS = 100;

// Minimum audio duration (ms) to send to Gemini. Prevents sending tiny packets.
const MIN_REQUEST_DURATION_MS = 500;

// Minimum time (ms) between successive Gemini requests to avoid quota burn.
const REQUEST_COOLDOWN_MS = 2000;

// Two-state machine: SPEAKING while packets keep arriving, SILENT otherwise.
const STATE = {
  SILENT: "silent",
  SPEAKING: "speaking",
};

// Single UDP socket used for receiving audio, sending ACKs and sending downlink audio.
const server = dgram.createSocket("udp4");

// --- Mutable module state ---------------------------------------------------
let currentState = STATE.SILENT;
// Raw packet payloads (Buffers) of the utterance currently being recorded
let audioBuffer = [];
// Date.now() timestamp of the most recent packet, or null before the first one
let lastPacketTime = null;
// Handle returned by setInterval for the silence check (null until listening)
let silenceCheckInterval = null;
// True while a Gemini request (and the TTS that follows it) is in flight
let isProcessing = false;
// Counters for the periodic statistics line; reset each time it is printed
let packetsReceived = 0;
let bytesReceived = 0;
let lastStatsTime = Date.now();
// Set to true once initializeGemini() has created the model
let geminiReady = false;
// NOTE(review): reconnectAttempts is only ever reset to 0 and
// MAX_RECONNECT_ATTEMPTS is not referenced elsewhere in the visible source.
let reconnectAttempts = 0;
const MAX_RECONNECT_ATTEMPTS = 10;
// Track last successful Gemini request time to enforce cooldown
let lastGeminiRequestTime = 0;

// Last known Arduino address (set when we receive UDP packets)
let lastArduinoAddress = null;

// Gemini setup: SDK client and model handle, assigned in initializeGemini()
let GenAI;
let model;

// Entry point. startServer is a function declaration, so it is hoisted and
// may be called before its definition below.
startServer();

/**
 * Boots the receiver: initializes the Gemini client, binds the UDP socket on
 * UDP_PORT on all IPv4 interfaces, and wires up the socket and SIGINT handlers.
 */
function startServer() {
  initializeGemini();

  // Attach the error handler before bind() so a bind failure can never be
  // emitted while no listener is registered.
  server.on("error", handleServerError);

  // Explicitly bind to all interfaces on the specified port. The callback is
  // registered as a one-shot "listening" listener, so it runs before
  // handleServerListening (which is attached afterwards).
  server.bind(UDP_PORT, "0.0.0.0", () => {
    console.log(`🚀 UDP Server binding to 0.0.0.0:${UDP_PORT}`);
  });

  server.on("listening", handleServerListening);
  server.on("message", handleIncomingAudioPacket);
  process.on("SIGINT", handleGracefulShutdown);
}

/**
 * "listening" event handler: prints the startup banner and the bound address,
 * then starts the periodic timer that checks for end-of-speech silence.
 */
function handleServerListening() {
  const boundAddress = server.address();
  logServerStartup(boundAddress);

  const { address: host, port } = boundAddress;
  console.log(`✓ Server bound to: ${host}:${port}`);

  silenceCheckInterval = setInterval(detectSilenceAndTranscribe, SILENCE_CHECK_INTERVAL_MS);
}

/**
 * "message" event handler for the UDP socket.
 *
 * Logs the packet, switches to the speaking state if necessary, appends a copy
 * of the payload to the current utterance, remembers the sender so downlink
 * audio can be sent back, updates statistics and replies with an "ACK" packet.
 *
 * @param {Buffer} msg   Raw packet payload.
 * @param {{address: string, port: number}} rinfo  Sender information from dgram.
 */
function handleIncomingAudioPacket(msg, rinfo) {
  const { address: senderAddress, port: senderPort } = rinfo;
  const receivedAt = new Date().toISOString();
  console.log(`📥 [${receivedAt}] UDP packet: ${msg.length} bytes from ${senderAddress}:${senderPort}`);

  beginSpeakingStateIfNeeded();
  lastPacketTime = Date.now();
  // Store a copy so the buffered audio does not alias the packet buffer.
  audioBuffer.push(Buffer.from(msg));
  // Downlink audio is later sent to whoever sent us audio most recently.
  lastArduinoAddress = senderAddress;

  updateStatistics(msg.length);
  logStatisticsIfIntervalElapsed();

  // Acknowledge receipt so the sender knows the packet arrived.
  const ack = Buffer.from("ACK");
  const onAckSent = (err) => {
    if (!err) return;
    console.error(`❌ Failed to send ACK to ${senderAddress}:${senderPort}:`, err.message);
  };
  server.send(ack, 0, ack.length, senderPort, senderAddress, onAckSent);
}

/**
 * Timer callback: while in the speaking state, checks how long ago the last
 * packet arrived and, once that exceeds SILENCE_TIMEOUT_MS, ends the utterance
 * and hands it off for processing.
 */
function detectSilenceAndTranscribe() {
  if (currentState !== STATE.SPEAKING) return;
  if (!lastPacketTime) return;

  const idleMs = Date.now() - lastPacketTime;
  if (idleMs <= SILENCE_TIMEOUT_MS) return;

  console.log(`🔇 Silence detected after ${idleMs} ms`);
  transitionToSilentAndProcessAudio();
}

/**
 * "error" event handler for the UDP socket: logs the error, stops the
 * silence-check timer and closes the socket.
 *
 * The timer must be cleared here; otherwise it keeps the event loop alive and
 * the process lingers with no socket to receive audio on.
 *
 * @param {Error} err  Error emitted by the socket.
 */
function handleServerError(err) {
  console.error(`Server error:\n${err.stack}`);
  if (silenceCheckInterval) {
    clearInterval(silenceCheckInterval);
    silenceCheckInterval = null;
  }
  server.close();
}

/**
 * SIGINT handler: stops the silence-check timer (if running), closes the UDP
 * socket and exits with status 0 once the socket has closed.
 */
function handleGracefulShutdown() {
  console.log("\n\nShutting down...");

  if (silenceCheckInterval) clearInterval(silenceCheckInterval);

  const exitAfterClose = () => {
    console.log("Server closed");
    process.exit(0);
  };
  server.close(exitAfterClose);
}

/**
 * Creates the Gemini client and model handle from GOOGLE_API_KEY.
 *
 * Exits the process with status 1 when the key is missing or when the SDK
 * cannot be loaded/initialized (printing an install hint if the module is
 * not found). On success sets GenAI, model and geminiReady.
 */
function initializeGemini() {
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    console.error("⚠️  GOOGLE_API_KEY not set. Cannot connect to Gemini API.");
    console.error("   1. Get key from: https://aistudio.google.com/apikey");
    console.error("   2. Run: export GOOGLE_API_KEY='your-key-here'");
    process.exit(1);
  }

  try {
    // Loaded lazily so a missing dependency produces the friendly hint below.
    const { GoogleGenerativeAI } = require("@google/generative-ai");
    GenAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY);
    model = GenAI.getGenerativeModel({ model: "gemini-2.0-flash" });
    geminiReady = true;
    console.log("🔌 Gemini 2.0 Flash model initialized successfully!");
    reconnectAttempts = 0;
  } catch (error) {
    console.error("❌ Failed to initialize Gemini:", error.message);
    const isMissingModule = error.message.includes("Cannot find module");
    if (isMissingModule) {
      console.error("\n⚠️  Missing dependency! Install with:");
      console.error("   npm install @google/generative-ai");
    }
    process.exit(1);
  }
}

/**
 * Enters the speaking state when not already in it, starting a fresh (empty)
 * utterance buffer. Does nothing while already speaking.
 */
function beginSpeakingStateIfNeeded() {
  if (currentState === STATE.SPEAKING) return;

  console.log("🎤 Speaking...");
  currentState = STATE.SPEAKING;
  audioBuffer = [];
}

/**
 * Wraps raw little-endian PCM samples in a 44-byte canonical WAV (RIFF) header.
 *
 * @param {Buffer} pcm16Data  Raw PCM sample bytes.
 * @param {object} [options]  Optional format overrides.
 * @param {number} [options.sampleRate=SAMPLE_RATE]       Samples per second.
 * @param {number} [options.numChannels=CHANNELS]         Channel count.
 * @param {number} [options.bitsPerSample=BITS_PER_SAMPLE] Bits per sample.
 * @returns {Buffer} A new buffer containing the header followed by the audio.
 *   A trailing partial sample frame (e.g. an odd byte for 16-bit audio) is
 *   dropped so the declared data size is always a whole number of frames.
 */
function createWavBuffer(pcm16Data, options = {}) {
  const {
    sampleRate = SAMPLE_RATE,
    numChannels = CHANNELS,
    bitsPerSample = BITS_PER_SAMPLE,
  } = options;

  const HEADER_SIZE = 44;
  const blockAlign = numChannels * (bitsPerSample / 8); // bytes per sample frame
  const byteRate = sampleRate * blockAlign;

  // Only whole frames are valid PCM; ignore any dangling bytes at the end.
  const audioDataSize = pcm16Data.length - (pcm16Data.length % blockAlign);

  const wav = Buffer.alloc(HEADER_SIZE + audioDataSize);

  // RIFF chunk descriptor
  wav.write("RIFF", 0);
  wav.writeUInt32LE(36 + audioDataSize, 4); // file size minus the first 8 bytes
  wav.write("WAVE", 8);

  // "fmt " sub-chunk
  wav.write("fmt ", 12);
  wav.writeUInt32LE(16, 16); // subchunk1 size
  wav.writeUInt16LE(1, 20); // audio format (1 = PCM)
  wav.writeUInt16LE(numChannels, 22);
  wav.writeUInt32LE(sampleRate, 24);
  wav.writeUInt32LE(byteRate, 28);
  wav.writeUInt16LE(blockAlign, 32);
  wav.writeUInt16LE(bitsPerSample, 34);

  // "data" sub-chunk
  wav.write("data", 36);
  wav.writeUInt32LE(audioDataSize, 40);
  pcm16Data.copy(wav, HEADER_SIZE, 0, audioDataSize);

  return wav;
}

/**
 * Ends the current utterance: switches to the silent state and, when possible,
 * sends the recorded audio to Gemini.
 *
 * The recorded chunks are detached from the shared `audioBuffer` BEFORE any
 * await. Packets that arrive while the (slow) Gemini/TTS round trip is running
 * start a new utterance in a fresh array, so they are no longer wiped when the
 * round trip finishes.
 *
 * Audio is dropped (with a log line) when Gemini is not ready or when a
 * previous request is still in flight.
 *
 * @returns {Promise<void>}
 */
async function transitionToSilentAndProcessAudio() {
  if (currentState === STATE.SILENT) return;

  console.log("📤 Sent");
  currentState = STATE.SILENT;

  // Take ownership of this utterance and leave an empty buffer behind.
  const utteranceChunks = audioBuffer;
  audioBuffer = [];

  if (utteranceChunks.length === 0) return;

  if (isProcessing) {
    console.log("⚠️ Previous request still in progress — dropping buffered audio");
    return;
  }
  if (!geminiReady) {
    console.log("⚠️ Gemini not ready — dropping buffered audio");
    return;
  }

  isProcessing = true; // cleared in processAudioWithGemini's finally block
  console.log(`🔄 Processing ${utteranceChunks.length} audio chunks with Gemini...`);
  await processAudioWithGemini(utteranceChunks);
}

/**
 * Sends one utterance to Gemini and speaks the text reply.
 *
 * Skips the request when the audio is shorter than MIN_REQUEST_DURATION_MS or
 * when the last successful request was less than REQUEST_COOLDOWN_MS ago.
 * Errors are logged, never thrown. Always clears `isProcessing` when done.
 *
 * @param {Buffer[]} [chunks=audioBuffer]  PCM chunks of the utterance; defaults
 *   to the shared buffer for callers that do not pass a snapshot.
 * @returns {Promise<void>}
 */
async function processAudioWithGemini(chunks = audioBuffer) {
  try {
    // Combine audio buffers into single buffer
    const audioData = Buffer.concat(chunks);

    // Estimate duration from bytes (16-bit samples)
    const totalSamples = audioData.length / (BITS_PER_SAMPLE / 8);
    const durationMs = (totalSamples / SAMPLE_RATE) * 1000;

    // Enforce minimum duration
    if (durationMs < MIN_REQUEST_DURATION_MS) {
      console.log(`⚠️ Skipping Gemini request — audio too short (${durationMs.toFixed(0)} ms)`);
      return;
    }

    // Enforce cooldown between requests to avoid quota burn
    const sinceLastRequestMs = Date.now() - lastGeminiRequestTime;
    if (sinceLastRequestMs < REQUEST_COOLDOWN_MS) {
      const remain = REQUEST_COOLDOWN_MS - sinceLastRequestMs;
      console.log(`⚠️ Skipping Gemini request — cooldown active (${remain.toFixed(0)} ms remaining)`);
      return;
    }

    console.log(`🔄 Sending ${audioData.length} bytes (${durationMs.toFixed(0)} ms) of audio to Gemini...`);

    // Convert PCM16 to WAV format (Gemini supports audio/wav)
    const base64Audio = createWavBuffer(audioData).toString("base64");

    // Send audio to Gemini with WAV mime type
    const response = await model.generateContent([
      {
        inlineData: {
          data: base64Audio,
          mimeType: "audio/wav",
        },
      },
      {
        text: "You are a friendly assistant. Listen to the audio and respond in 1-2 short sentences. Be conversational.",
      },
    ]);

    lastGeminiRequestTime = Date.now();

    const responseText = response.response.text();

    if (responseText) {
      console.log(`💬 Gemini response: "${responseText}"`);
      await speakTextAloud(responseText);
    } else {
      console.log("⚠️  No response from Gemini");
    }
  } catch (error) {
    // Tolerate non-Error throwables so the handler itself cannot throw.
    const message = error instanceof Error ? error.message : String(error);
    console.error("❌ Gemini processing error:", message);
    if (message.includes("400")) {
      console.error("   ℹ️  Gemini may not have received valid audio data");
    }
  } finally {
    isProcessing = false;
  }
}

/**
 * Speaks a text reply. Uses OpenAI TTS when OPENAI_API_KEY is set; otherwise
 * falls back to the local `say` command, logging (not throwing) if that fails.
 *
 * @param {string} text  Text to speak.
 * @returns {Promise<void>}
 */
async function speakTextAloud(text) {
  console.log(`🔊 Playing TTS for: "${text}"`);

  // Gemini 2.0 Flash does not yet return audio; use OpenAI TTS (or local macOS) instead.
  const hasOpenAiKey = Boolean(process.env.OPENAI_API_KEY);
  if (!hasOpenAiKey) {
    console.log("⚠️  OPENAI_API_KEY not set; falling back to local 'say' (macOS only)...");
    try {
      await playTextWithSay(text);
    } catch (err) {
      console.error("❌ Local TTS fallback failed:", err.message);
    }
    return;
  }

  await speakTextAloudOpenAI(text);
}

/**
 * Converts an arbitrary audio buffer to raw PCM16LE mono at SAMPLE_RATE by
 * piping it through an `ffmpeg` child process.
 *
 * @param {Buffer} inputBuffer  Encoded audio in any format ffmpeg can probe.
 * @returns {Promise<Buffer>} Resolves with the raw PCM bytes; rejects if ffmpeg
 *   cannot be started or exits with a non-zero code.
 */
function convertToPcm16Le(inputBuffer) {
  return new Promise((resolve, reject) => {
    const ffmpeg = spawn("ffmpeg", [
      "-i", "pipe:0",
      "-f", "s16le",
      "-acodec", "pcm_s16le",
      "-ac", "1",
      "-ar", String(SAMPLE_RATE),
      "pipe:1",
    ]);

    // Attach first so a spawn failure (e.g. ffmpeg not installed) always has a listener.
    ffmpeg.on("error", (err) => reject(err));

    const outChunks = [];
    let stdinError = null;

    ffmpeg.stdout.on("data", (chunk) => outChunks.push(chunk));
    ffmpeg.stderr.on("data", () => {}); // drain and suppress verbose logs

    // If ffmpeg is missing or exits early, writing to its stdin fails (EPIPE or
    // similar). Without a listener that stream error would be an unhandled
    // 'error' event and crash the process. The final outcome is still decided
    // by the process-level "error"/"close" events.
    ffmpeg.stdin.on("error", (err) => {
      stdinError = err;
    });

    ffmpeg.on("close", (code) => {
      if (code === 0) {
        resolve(Buffer.concat(outChunks));
        return;
      }
      const detail = stdinError ? ` (stdin: ${stdinError.message})` : "";
      reject(new Error(`ffmpeg exited with ${code}${detail}`));
    });

    ffmpeg.stdin.end(inputBuffer);
  });
}

/**
 * Streams a PCM16LE buffer to the ESP32 over UDP.
 *
 * First sends a "PLAY" control packet to DOWNLINK_PORT, waits 80 ms, then sends
 * the audio in packets of CHUNK_SAMPLES samples with a 2 ms gap between them.
 *
 * The destination address is captured once at the start so that every packet
 * of one reply goes to the same device, even if packets from another sender
 * update `lastArduinoAddress` while streaming.
 *
 * @param {Buffer} pcmBuffer  Raw PCM bytes to send.
 * @returns {Promise<number>} Resolves with the number of audio packets sent;
 *   rejects if no sender address is known yet or a send fails.
 */
function sendPcmToArduino(pcmBuffer) {
  return new Promise((resolve, reject) => {
    if (!lastArduinoAddress) return reject(new Error("No Arduino address known yet"));

    const targetAddress = lastArduinoAddress;
    const chunkBytes = CHUNK_SAMPLES * (BITS_PER_SAMPLE / 8);
    let offset = 0;
    let sent = 0;

    // Sends one chunk, then schedules the next one from the send callback.
    const sendNext = () => {
      if (offset >= pcmBuffer.length) {
        console.log(`✓ Finished sending ${sent} downlink packets to ${targetAddress}:${DOWNLINK_PORT}`);
        return resolve(sent);
      }
      const end = Math.min(offset + chunkBytes, pcmBuffer.length);
      // subarray() is the non-deprecated equivalent of Buffer#slice (no copy).
      const chunk = pcmBuffer.subarray(offset, end);
      server.send(chunk, 0, chunk.length, DOWNLINK_PORT, targetAddress, (err) => {
        if (err) return reject(err);
        sent++;
        offset = end;
        // small delay to avoid saturating network (1-2 ms)
        setTimeout(sendNext, 2);
      });
    };

    // Send a small control packet to tell the ESP32 to switch to speaker mode
    const control = Buffer.from("PLAY");
    console.log(`▶️ Sending PLAY control to ${targetAddress}:${DOWNLINK_PORT}`);
    server.send(control, 0, control.length, DOWNLINK_PORT, targetAddress, (err) => {
      if (err) {
        console.error(`❌ Failed to send PLAY control to ${targetAddress}:${DOWNLINK_PORT}:`, err.message);
        return reject(err);
      }

      // small delay to let the ESP32 switch to speaker mode
      setTimeout(sendNext, 80);
    });
  });
}

/**
 * Synthesizes `text` with the OpenAI speech endpoint and streams the result to
 * the ESP32 (converted to PCM via ffmpeg). If conversion/sending fails, the
 * audio is played locally instead. If the API cannot be reached after
 * MAX_RETRIES attempts, falls back to the local `say` command.
 *
 * Retry policy:
 *  - HTTP 429: wait for the Retry-After header when it is usable, otherwise
 *    use exponential backoff.
 *  - HTTP 5xx and network errors: exponential backoff.
 *  - Any other status: give up immediately.
 * No delay is spent after the final attempt.
 *
 * Never throws; all failures are logged.
 *
 * @param {string} text  Text to speak.
 * @returns {Promise<void>}
 */
async function speakTextAloudOpenAI(text) {
  const MAX_RETRIES = 5;
  const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

  try {
    let attempt = 0;
    let waitMs = 1000;
    let lastErr = null;

    while (attempt < MAX_RETRIES) {
      attempt++;
      const isLastAttempt = attempt >= MAX_RETRIES;

      try {
        const response = await fetch("https://api.openai.com/v1/audio/speech", {
          method: "POST",
          headers: {
            Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            model: "tts-1",
            voice: "onyx",
            input: text,
            response_format: "wav",
          }),
        });

        if (response.ok) {
          const buffer = Buffer.from(await response.arrayBuffer());
          try {
            const pcm = await convertToPcm16Le(buffer);
            await sendPcmToArduino(pcm);
          } catch (err) {
            console.error("❌ Failed to convert/send OpenAI TTS to Arduino:", err.message);
            // fallback to local playback (best effort: log, never throw)
            try {
              await playAudioBufferThroughSpeakers(buffer);
            } catch (playErr) {
              console.error("❌ Local playback fallback failed:", playErr.message);
            }
          }
          return;
        }

        lastErr = `TTS API error: ${response.status} ${response.statusText}`;

        const isRateLimited = response.status === 429;
        const isServerError = response.status >= 500 && response.status < 600;

        // Non-retriable error, or nothing left to retry: stop here.
        if ((!isRateLimited && !isServerError) || isLastAttempt) break;

        // If rate limited, respect Retry-After header if present and usable
        if (isRateLimited) {
          const ra = response.headers.get("retry-after");
          const raMs = parseRetryAfterMs(ra);
          if (raMs !== null) {
            console.log(`⚠️  OpenAI TTS rate-limited. Respecting Retry-After: ${ra} (${raMs} ms)`);
            await sleep(raMs);
            continue;
          }
        }

        // 5xx, or 429 without a usable Retry-After: exponential backoff
        console.log(`⚠️  Transient TTS error (${response.status}). Backing off ${waitMs}ms and retrying...`);
        await sleep(waitMs);
        waitMs *= 2;
      } catch (err) {
        lastErr = err.message;
        if (isLastAttempt) break;
        console.log(`⚠️  Network error when calling TTS: ${err.message}. Retrying in ${waitMs}ms...`);
        await sleep(waitMs);
        waitMs *= 2;
      }
    }

    // If we reached here, TTS failed. Fallback to local `say` if available.
    console.error(`❌ OpenAI TTS failed after ${attempt} attempts: ${lastErr}`);
    console.log("ℹ️  Falling back to local macOS 'say' if available...");
    try {
      await playTextWithSay(text);
    } catch (err) {
      console.error("❌ Local TTS fallback failed:", err.message);
    }
  } catch (error) {
    console.error("❌ OpenAI TTS unexpected error:", error.message);
  }
}

/**
 * Parses a Retry-After header value into a delay in milliseconds.
 * Accepts either a number of seconds or an HTTP date.
 *
 * @param {string|null|undefined} headerValue  Raw header value.
 * @returns {number|null} Non-negative delay in ms, or null if absent/unparsable.
 */
function parseRetryAfterMs(headerValue) {
  if (!headerValue) return null;
  const trimmed = String(headerValue).trim();

  if (/^\d+(\.\d+)?$/.test(trimmed)) {
    return Math.round(Number(trimmed) * 1000);
  }

  const dateMs = Date.parse(trimmed);
  if (Number.isNaN(dateMs)) return null;
  return Math.max(0, dateMs - Date.now());
}

/**
 * Speaks `text` through the local `say` command.
 *
 * @param {string} text  Text to speak.
 * @returns {Promise<void>} Resolves when `say` exits with code 0. Rejects when
 *   the platform is not macOS, when the process cannot be started, or when it
 *   exits with a non-zero code.
 */
async function playTextWithSay(text) {
  if (process.platform !== "darwin") {
    throw new Error("Local TTS 'say' only available on macOS");
  }

  return new Promise((resolve, reject) => {
    const sayProcess = spawn("say", [text]);
    sayProcess.on("error", (err) => reject(err));
    sayProcess.on("close", (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`say exited with ${exitCode}`));
        return;
      }
      resolve();
    });
  });
}

/**
 * Plays an encoded audio buffer on the local machine by piping it to `ffplay`.
 *
 * Best effort: failures are logged and the returned promise still resolves, so
 * callers never have to handle a rejection from playback itself. The promise
 * settles only after ffplay has exited (or failed to start), so awaiting it
 * really waits for playback to finish.
 *
 * @param {Buffer} buffer  Encoded audio (any format ffplay can probe).
 * @returns {Promise<void>}
 */
async function playAudioBufferThroughSpeakers(buffer) {
  return new Promise((resolve) => {
    const ffplay = spawn("ffplay", ["-nodisp", "-autoexit", "-loglevel", "quiet", "-i", "pipe:0"]);

    ffplay.on("error", (err) => {
      console.error("❌ ffplay error:", err.message);
      resolve();
    });

    ffplay.on("close", (code) => {
      if (code === 0) {
        console.log("✓ TTS playback completed");
      } else {
        console.error(`❌ ffplay exited with code ${code}`);
      }
      resolve();
    });

    // If ffplay is missing or exits early the write below fails; without this
    // listener the stream error would be unhandled and crash the process.
    ffplay.stdin.on("error", (err) => {
      console.error("❌ ffplay stdin error:", err.message);
    });

    ffplay.stdin.end(buffer);
  });
}

/**
 * Accounts for one received packet in the running statistics counters.
 *
 * @param {number} messageLength  Payload size of the packet in bytes.
 */
function updateStatistics(messageLength) {
  packetsReceived += 1;
  bytesReceived = bytesReceived + messageLength;
}

/**
 * Prints packet rate, byte rate and buffered-audio size once more than
 * STATS_INTERVAL_MS has passed since the previous report, then resets the
 * counters and the report timestamp. Does nothing otherwise.
 */
function logStatisticsIfIntervalElapsed() {
  const now = Date.now();
  const elapsedMs = now - lastStatsTime;
  if (elapsedMs <= STATS_INTERVAL_MS) return;

  const elapsedSec = elapsedMs / 1000;

  let bufferedBytes = 0;
  for (const chunk of audioBuffer) {
    bufferedBytes += chunk.length;
  }

  const packetRate = (packetsReceived / elapsedSec).toFixed(1);
  const kbRate = (bytesReceived / elapsedSec / 1024).toFixed(2);
  const bufferedKb = (bufferedBytes / 1024).toFixed(2);

  console.log(`📊 Stats: ${packetRate} packets/s, ${kbRate} KB/s, buffer: ${bufferedKb} KB`);

  // Start a new measurement window.
  packetsReceived = 0;
  bytesReceived = 0;
  lastStatsTime = now;
}

/**
 * Prints the startup banner: audio format, API-key status, the bound address,
 * the machine's non-internal IPv4 addresses and troubleshooting tips.
 *
 * All port numbers in the banner come from the actually bound address, and the
 * TTS lines describe what this file really does (OpenAI TTS when
 * OPENAI_API_KEY is set, macOS `say` otherwise).
 *
 * @param {{address: string, port: number}} address  Result of server.address().
 */
function logServerStartup(address) {
  const networkAddresses = getNetworkAddresses();
  const { port } = address;

  console.log("\n==============================================");
  console.log("ESP32 Audio UDP Receiver with Gemini API");
  console.log("==============================================");
  console.log(`UDP Server listening on port ${port}`);
  console.log(`Sample Rate: ${SAMPLE_RATE} Hz`);
  console.log(`Channels: ${CHANNELS} (mono)`);
  console.log(`Bits per sample: ${BITS_PER_SAMPLE}`);
  console.log(`Silence timeout: ${SILENCE_TIMEOUT_MS}ms`);
  console.log(`Google API Key: ${process.env.GOOGLE_API_KEY ? "✓ Set" : "✗ Not set"}`);
  console.log(`OpenAI API Key: ${process.env.OPENAI_API_KEY ? "✓ Set" : "✗ Not set"}`);
  console.log(`Model: Gemini 2.0 Flash (audio in, text out)`);
  console.log(`TTS: OpenAI (if OPENAI_API_KEY is set) | macOS 'say' (fallback)`);
  console.log("\n📡 Network Configuration:");
  console.log(`  Bound to: ${address.address}:${port}`);
  networkAddresses.forEach((addr) => {
    console.log(`  Available: ${addr}:${port}`);
  });
  console.log("\n⚠️  Troubleshooting Tips:");
  console.log("  1. Verify Arduino's server IP matches one of the 'Available' addresses above");
  console.log(`  2. Check firewall allows UDP port ${port}`);
  console.log("  3. Verify GOOGLE_API_KEY is set: echo $GOOGLE_API_KEY");
  console.log("  4. For TTS playback on the ESP32, also set: export OPENAI_API_KEY='your-key'");
  console.log(`  5. Test with: nc -u <server-ip> ${port} (or send test UDP packets)`);
  console.log("\nWaiting for ESP32 to send audio...");
  console.log("==============================================\n");
}

function getNetworkAddresses() {
  const interfaces = require("os").networkInterfaces();
  const addresses = [];

  for (const name of Object.keys(interfaces)) {
    for (const iface of interfaces[name]) {
      if (iface.family === "IPv4" && !iface.internal) {
        addresses.push(iface.address);
      }
    }
  }

  return addresses;
}
