Week 12: Application Interfaces


For this week, I used the XIAO ESP32-S3 microcontroller with a camera module to stream video to my computer. I also developed an interface with a toggle button that turns face recognition on and off and, when enabled, displays bounding boxes around detected faces in real time. The process involved setting up the microcontroller, handling WebSocket communication, and working with the ESP-IDF framework for face detection. While challenging, it taught me a lot about networking protocols and real-time data streaming.

Challenges & Solutions

Code for Camera Server

Below is the code used for the ESP32-S3 camera server, which streams video data over Wi-Fi and enables face detection.

        #include <WiFi.h>  // WiFi.begin() / WiFi.status() / WL_CONNECTED used in setup()
        #include "esp_camera.h"
        #include "esp_http_server.h"

        // NOTE(review): the camera pin macros used in setup() (Y2_GPIO_NUM, XCLK_GPIO_NUM, ...)
        // are not defined anywhere in this listing; they presumably come from a
        // board-specific pin header -- confirm and include it here.

        // WiFi credentials (replace the placeholders before flashing)
        const char *ssid = "YOUR_WIFI_SSID";
        const char *password = "YOUR_WIFI_PASSWORD";

        // Face-detection toggle; written by controlHandler() from the "val" query parameter.
        bool faceDetectionEnabled = false;

        // Starts the HTTP server that registers the /stream and /control handlers.
        void startCameraServer();
        void setup() {
            // Configure camera
            camera_config_t config;
            config.ledc_channel = LEDC_CHANNEL_0;
            config.ledc_timer = LEDC_TIMER_0;
            config.pin_d0 = Y2_GPIO_NUM;
            config.pin_d1 = Y3_GPIO_NUM;
            config.pin_d2 = Y4_GPIO_NUM;
            config.pin_d3 = Y5_GPIO_NUM;
            config.pin_d4 = Y6_GPIO_NUM;
            config.pin_d5 = Y7_GPIO_NUM;
            config.pin_d6 = Y8_GPIO_NUM;
            config.pin_d7 = Y9_GPIO_NUM;
            config.pin_xclk = XCLK_GPIO_NUM;
            config.pin_pclk = PCLK_GPIO_NUM;
            config.pin_vsync = VSYNC_GPIO_NUM;
            config.pin_href = HREF_GPIO_NUM;
            config.pin_sccb_sda = SIOD_GPIO_NUM;
            config.pin_sccb_scl = SIOC_GPIO_NUM;
            config.pin_pwdn = PWDN_GPIO_NUM;
            config.pin_reset = RESET_GPIO_NUM;
            config.xclk_freq_hz = 20000000;
            config.pixel_format = PIXFORMAT_JPEG;
            config.frame_size = FRAMESIZE_QVGA;
            config.jpeg_quality = 10;
            config.fb_count = 2;
            if (esp_camera_init(&config) != ESP_OK) {
                Serial.println("Camera init failed!");
            // Connect to Wi-Fi
            WiFi.begin(ssid, password);
            while (WiFi.status() != WL_CONNECTED) {
            Serial.println("\nWi-Fi connected.");
            Serial.print("Camera Stream URL: http://");
            // Start the camera server
        void loop() {
        // Stream handler: answers GET /stream with an endless
        // multipart/x-mixed-replace response, one JPEG part per captured frame.
        // Returns the first non-ESP_OK result (capture, conversion or send failure,
        // the latter typically meaning the client went away).
        esp_err_t streamHandler(httpd_req_t *req) {
            static const char *stream_content_type =
                "multipart/x-mixed-replace;boundary=123456789000000000000987654321";
            // Each part starts with "--" + boundary; the boundary must match the one
            // announced in the content type exactly.
            static const char *stream_part_header =
                "\r\n--123456789000000000000987654321\r\n"
                "Content-Type: image/jpeg\r\nContent-Length: %u\r\n\r\n";

            esp_err_t res = httpd_resp_set_type(req, stream_content_type);
            if (res != ESP_OK) return res;

            char part_buf[128];  // large enough for the full part header (about 90 bytes)

            while (true) {
                camera_fb_t *fb = esp_camera_fb_get();
                if (!fb) {
                    Serial.println("Camera capture failed");
                    res = ESP_FAIL;
                    break;
                }

                uint8_t *jpg_buf = NULL;
                size_t jpg_buf_len = 0;
                bool jpg_buf_owned = false;  // true when frame2jpg() allocated jpg_buf

                if (fb->format != PIXFORMAT_JPEG) {
                    jpg_buf_owned = frame2jpg(fb, 80, &jpg_buf, &jpg_buf_len);
                    if (!jpg_buf_owned) {
                        Serial.println("JPEG compression failed");
                        res = ESP_FAIL;
                    }
                } else {
                    jpg_buf_len = fb->len;
                    jpg_buf = fb->buf;
                }

                if (res == ESP_OK) {
                    const int hlen = snprintf(part_buf, sizeof(part_buf), stream_part_header,
                                              (unsigned int)jpg_buf_len);
                    if (hlen < 0 || (size_t)hlen >= sizeof(part_buf)) {
                        res = ESP_FAIL;  // header did not fit; never send a truncated header
                    } else {
                        res = httpd_resp_send_chunk(req, part_buf, hlen);
                    }
                }
                if (res == ESP_OK) {
                    res = httpd_resp_send_chunk(req, (const char *)jpg_buf, jpg_buf_len);
                }

                // Release per-frame resources on every path before deciding to stop.
                if (jpg_buf_owned) {
                    free(jpg_buf);
                }
                esp_camera_fb_return(fb);

                if (res != ESP_OK) break;
            }
            return res;
        }
        // Face detection handler: answers GET /control?var=face_detect&val=<int>.
        // A non-zero val enables faceDetectionEnabled, zero disables it. Requests
        // without a recognised query are acknowledged with an empty response.
        // Returns ESP_FAIL (after sending a 500) only when the query buffer cannot
        // be allocated; otherwise returns the result of sending the empty response.
        esp_err_t controlHandler(httpd_req_t *req) {
            const size_t buf_len = httpd_req_get_url_query_len(req) + 1;  // +1 for the terminator
            if (buf_len > 1) {
                char *buf = (char *)malloc(buf_len);
                if (buf == NULL) {
                    httpd_resp_send_500(req);
                    return ESP_FAIL;
                }
                // Only parse the buffer if the query string was actually copied into it.
                if (httpd_req_get_url_query_str(req, buf, buf_len) == ESP_OK) {
                    char param[16];
                    char val[8];
                    if (httpd_query_key_value(buf, "var", param, sizeof(param)) == ESP_OK &&
                        strcmp(param, "face_detect") == 0 &&
                        httpd_query_key_value(buf, "val", val, sizeof(val)) == ESP_OK) {
                        faceDetectionEnabled = atoi(val) != 0;
                        Serial.printf("Face Detection: %s\n", faceDetectionEnabled ? "Enabled" : "Disabled");
                    }
                }
                free(buf);
            }
            httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
            return httpd_resp_send(req, NULL, 0);
        }
        // Start the camera server
        void startCameraServer() {
            httpd_config_t config = HTTPD_DEFAULT_CONFIG();
            httpd_handle_t server = NULL;
            httpd_uri_t stream_uri = {
                .uri = "/stream",
                .method = HTTP_GET,
                .handler = streamHandler,
                .user_ctx = NULL
            httpd_uri_t control_uri = {
                .uri = "/control",
                .method = HTTP_GET,
                .handler = controlHandler,
                .user_ctx = NULL
            if (httpd_start(&server, &config) == ESP_OK) {
                httpd_register_uri_handler(server, &stream_uri);
                httpd_register_uri_handler(server, &control_uri);

Application Interface Code

The application interface is implemented as a web server hosted on the ESP32-S3. It streams the video feed and processes frames to detect and highlight faces.

        #pragma GCC diagnostic ignored "-Wformat"
        #pragma GCC diagnostic ignored "-Wstrict-aliasing"
        #include "face_recognition_tool.hpp"
        #include "face_recognition_112_v1_s16.hpp"
        #include "face_recognition_112_v1_s8.hpp"
        #pragma GCC diagnostic error "-Wformat"
        #pragma GCC diagnostic warning "-Wstrict-aliasing"
        #define QUANT_TYPE 0  //if set to 1 => very large firmware, very slow, reboots when streaming...
        #define FACE_ID_SAVE_NUMBER 7
        #define FACE_COLOR_WHITE  0x00FFFFFF
        #define FACE_COLOR_BLACK  0x00000000
        #define FACE_COLOR_RED    0x000000FF
        #define FACE_COLOR_GREEN  0x0000FF00
        #define FACE_COLOR_BLUE   0x00FF0000
        // Enable LED FLASH setting
        // LED FLASH setup
        #define LED_LEDC_GPIO            22  //configure LED pin
        #define CONFIG_LED_MAX_INTENSITY 255
        int led_duty = 0;
        bool isStreaming = false;
        typedef struct {
          httpd_req_t *req;
          size_t len;
        } jpg_chunking_t;
        #define PART_BOUNDARY "123456789000000000000987654321"
        static const char *_STREAM_CONTENT_TYPE = "multipart/x-mixed-replace;boundary=" PART_BOUNDARY;
        static const char *_STREAM_BOUNDARY = "\r\n--" PART_BOUNDARY "\r\n";
        static const char *_STREAM_PART = "Content-Type: image/jpeg\r\nContent-Length: %u\r\nX-Timestamp: %d.%06d\r\n\r\n";
        httpd_handle_t stream_httpd = NULL;
        httpd_handle_t camera_httpd = NULL;
        static int8_t detection_enabled = 0;
        // #if TWO_STAGE
        // static HumanFaceDetectMSR01 s1(0.1F, 0.5F, 10, 0.2F);
        // static HumanFaceDetectMNP01 s2(0.5F, 0.3F, 5);
        // #else
        // static HumanFaceDetectMSR01 s1(0.3F, 0.5F, 10, 0.2F);
        // #endif
        static int8_t recognition_enabled = 0;
        static int8_t is_enrolling = 0;
        #if QUANT_TYPE
        // S16 model
        FaceRecognition112V1S16 recognizer;
        // S8 model
        FaceRecognition112V1S8 recognizer;
        // Rolling-average filter state: a ring buffer of the most recent samples
        // together with their running sum.
        typedef struct {
          size_t size;   //number of values used for filtering
          size_t index;  //current value index
          size_t count;  //value count
          int sum;
          int *values;  //array to be filled with values
        } ra_filter_t;

        static ra_filter_t ra_filter;

        // Initialises `filter` to average over the last `sample_size` values.
        // Returns `filter` on success, or NULL when `sample_size` is zero or the
        // sample buffer cannot be allocated. In the NULL case `filter` is left
        // zeroed, so ra_filter_run() simply passes values through.
        static ra_filter_t *ra_filter_init(ra_filter_t *filter, size_t sample_size) {
          memset(filter, 0, sizeof(ra_filter_t));
          if (sample_size == 0) {
            return NULL;  // a zero-length window would make the modulo in ra_filter_run() undefined
          }

          // calloc hands back an already-zeroed buffer.
          filter->values = static_cast<int *>(calloc(sample_size, sizeof(int)));
          if (!filter->values) {
            return NULL;
          }
          filter->size = sample_size;
          return filter;
        }

        // Feeds `value` into the filter and returns the integer average of the
        // samples currently held (at most `size` of them). An uninitialised filter
        // returns `value` unchanged.
        static int ra_filter_run(ra_filter_t *filter, int value) {
          if (!filter->values) {
            return value;
          }

          // Replace the oldest sample and keep the running sum in step.
          filter->sum -= filter->values[filter->index];
          filter->values[filter->index] = value;
          filter->sum += value;

          // Advance the ring-buffer cursor, wrapping at the window size.
          filter->index = (filter->index + 1) % filter->size;

          // Until the window is full, average only over the samples seen so far.
          if (filter->count < filter->size) {
            filter->count++;
          }
          return filter->sum / static_cast<int>(filter->count);
        }
        // Draws `str` in `color` at y = 10, horizontally centred on the frame.
        // The width estimate uses 14 pixels per character.
        static void rgb_print(fb_data_t *fb, uint32_t color, const char *str) {
          // Compute in signed arithmetic: with unsigned maths a string wider than
          // the frame would wrap around to a huge x offset.
          const int text_width = static_cast<int>(strlen(str)) * 14;
          int x = (static_cast<int>(fb->width) - text_width) / 2;
          if (x < 0) {
            x = 0;  // text wider than the frame: start at the left edge
          }
          fb_gfx_print(fb, x, 10, color, str);
        }
        // printf-style wrapper around rgb_print(): formats the message and draws it
        // on the frame buffer in `color`.
        // Returns the formatted length, or 0 if formatting fails or the heap buffer
        // needed for a long message cannot be allocated (nothing is drawn then).
        static int rgb_printf(fb_data_t *fb, uint32_t color, const char *format, ...) {
          char loc_buf[64];
          char *temp = loc_buf;

          va_list arg;
          va_list copy;
          va_start(arg, format);
          va_copy(copy, arg);  // a va_list may be consumed only once; keep a copy for the retry

          const int len = vsnprintf(loc_buf, sizeof(loc_buf), format, arg);
          va_end(arg);

          if (len < 0) {
            va_end(copy);
            return 0;
          }

          if (static_cast<size_t>(len) >= sizeof(loc_buf)) {
            // Message did not fit in the stack buffer: format again into the heap.
            temp = static_cast<char *>(malloc(len + 1));
            if (temp == NULL) {
              va_end(copy);
              return 0;
            }
            vsnprintf(temp, len + 1, format, copy);
          }
          va_end(copy);

          rgb_print(fb, color, temp);

          if (temp != loc_buf) {
            free(temp);
          }
          return len;
        }
        static void draw_face_boxes(fb_data_t *fb, std::list *results, int face_id) {
          int x, y, w, h;
          uint32_t color = FACE_COLOR_YELLOW;
          if (face_id < 0) {
            color = FACE_COLOR_RED;
          } else if (face_id > 0) {
            color = FACE_COLOR_GREEN;
          if (fb->bytes_per_pixel == 2) {
            //color = ((color >> 8) & 0xF800) | ((color >> 3) & 0x07E0) | (color & 0x001F);
            color = ((color >> 16) & 0x001F) | ((color >> 3) & 0x07E0) | ((color << 8) & 0xF800);
          int i = 0;
          for (std::list::iterator prediction = results->begin(); prediction != results->end(); prediction++, i++) {
            // rectangle box
            x = (int)prediction->box[0];
            y = (int)prediction->box[1];
            w = (int)prediction->box[2] - x + 1;
            h = (int)prediction->box[3] - y + 1;
            if ((x + w) > fb->width) {
              w = fb->width - x;
            if ((y + h) > fb->height) {
              h = fb->height - y;
            fb_gfx_drawFastHLine(fb, x, y, w, color);
            fb_gfx_drawFastHLine(fb, x, y + h - 1, w, color);
            fb_gfx_drawFastVLine(fb, x, y, h, color);
            fb_gfx_drawFastVLine(fb, x + w - 1, y, h, color);
        #if TWO_STAGE
            // landmarks (left eye, mouth left, nose, right eye, mouth right)
            int x0, y0, j;
            for (j = 0; j < 10; j += 2) {
              x0 = (int)prediction->keypoint[j];
              y0 = (int)prediction->keypoint[j + 1];
              fb_gfx_fillRect(fb, x0, y0, 3, 3, color);
        static int run_face_recognition(fb_data_t *fb, std::list *results) {
          std::vector landmarks = results->front().keypoint;
          int id = -1;
          Tensor tensor;
          tensor.set_element((uint8_t *)fb->data).set_shape({fb->height, fb->width, 3}).set_auto_free(false);
          int enrolled_count = recognizer.get_enrolled_id_num();
          if (enrolled_count < FACE_ID_SAVE_NUMBER && is_enrolling) {
            id = recognizer.enroll_id(tensor, landmarks, "", true);
            log_i("Enrolled ID: %d", id);
            rgb_printf(fb, FACE_COLOR_CYAN, "ID[%u]", id);
          face_info_t recognize = recognizer.recognize(tensor, landmarks);
          if (recognize.id >= 0) {
            rgb_printf(fb, FACE_COLOR_GREEN, "ID[%u]: %.2f", recognize.id, recognize.similarity);
          } else {
            rgb_print(fb, FACE_COLOR_RED, "Intruder Alert!");
          return recognize.id;

Videos & Results

The camera successfully streamed video data, and face detection worked as intended, albeit with some lag and moderate image quality.

XIAO ESP32-S3 Microcontroller Face Detection in Action