This week, I used the XIAO ESP32-S3 microcontroller with a camera module to stream video data to my computer. I also developed an interface with a toggle button that turns on face recognition and draws bounding boxes around the detected faces in real time. The process involved setting up the microcontroller, handling WebSocket communication, and working with the ESP-IDF framework for face detection. Although the work was challenging, I learned a lot about networking protocols and real-time data streaming.
Below is the code used for the ESP32-S3 camera server, which streams video data over Wi-Fi and enables face detection.
#include "esp_camera.h"
#include "esp_http_server.h"
#include <WiFi.h>
// WiFi credentials
// Placeholder values: replace them with the real network name and passphrase.
const char *ssid = "YOUR_WIFI_SSID";
const char *password = "YOUR_WIFI_PASSWORD";
// Face-detection toggle. In the code shown here it is written (and logged) only by
// controlHandler(); the stream handler does not consult it.
bool faceDetectionEnabled = false;
// Start camera server function
// Forward declaration: the definition follows the request handlers; setup() calls it.
void startCameraServer();
/**
 * Arduino entry point: initialises the serial port, the camera and Wi-Fi,
 * prints the stream URL and starts the HTTP server.
 *
 * If the camera cannot be initialised the function returns early, so neither
 * Wi-Fi nor the HTTP server is started.
 */
void setup() {
  Serial.begin(115200);

  // Configure camera.
  // Value-initialise the struct so that every member that is not assigned
  // below starts at zero instead of holding whatever was on the stack.
  camera_config_t config = {};
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer = LEDC_TIMER_0;
  config.pin_d0 = Y2_GPIO_NUM;
  config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM;
  config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM;
  config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM;
  config.pin_d7 = Y9_GPIO_NUM;
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href = HREF_GPIO_NUM;
  config.pin_sccb_sda = SIOD_GPIO_NUM;
  config.pin_sccb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;
  config.pixel_format = PIXFORMAT_JPEG;  // frames arrive already JPEG-encoded
  config.frame_size = FRAMESIZE_QVGA;
  config.jpeg_quality = 10;
  config.fb_count = 2;

  if (esp_camera_init(&config) != ESP_OK) {
    Serial.println("Camera init failed!");
    return;
  }

  // Connect to Wi-Fi; this loop waits (printing dots) until the link is up.
  WiFi.begin(ssid, password);
  while (WiFi.status() != WL_CONNECTED) {
    delay(500);
    Serial.print(".");
  }
  Serial.println("\nWi-Fi connected.");
  Serial.print("Camera Stream URL: http://");
  Serial.print(WiFi.localIP());
  Serial.println("/stream");

  // Start the camera server
  startCameraServer();
}
// Arduino main loop. The sketch does no per-iteration work here; requests are
// handled by the handlers registered in startCameraServer(), so this only idles.
void loop() {
  const int idle_delay_ms = 100;
  delay(idle_delay_ms);
}
// Stream handler
/**
 * Serves the camera as a multipart/x-mixed-replace (MJPEG) response.
 *
 * Each loop iteration grabs one frame, converts it to JPEG when the sensor did
 * not deliver JPEG, and sends a part header followed by the image bytes.
 *
 * @param req  Request the stream is written to.
 * @return The first non-ESP_OK result: a failed capture, a failed JPEG
 *         conversion, or a failed send (for example because the client
 *         went away). The loop only ends on such an error.
 */
esp_err_t streamHandler(httpd_req_t *req) {
  // Boundary token WITHOUT leading dashes: the "--" prefix is added by the
  // part-header format below, and the token must match the one announced in
  // the Content-Type header exactly.
  static const char *stream_boundary = "123456789000000000000987654321";
  static const char *stream_content_type = "multipart/x-mixed-replace;boundary=123456789000000000000987654321";

  esp_err_t res = httpd_resp_set_type(req, stream_content_type);
  if (res != ESP_OK) return res;

  while (res == ESP_OK) {
    camera_fb_t *fb = esp_camera_fb_get();
    if (!fb) {
      Serial.println("Camera capture failed");
      res = ESP_FAIL;
      break;  // stop instead of spinning on a camera that delivers nothing
    }

    uint8_t *jpg_buf = fb->buf;
    size_t jpg_buf_len = fb->len;
    // True only when jpg_buf was allocated by frame2jpg() and has to be freed
    // here; otherwise jpg_buf aliases the frame buffer owned by the driver.
    bool owns_jpg_buf = false;

    if (fb->format != PIXFORMAT_JPEG) {
      jpg_buf = NULL;
      jpg_buf_len = 0;
      owns_jpg_buf = frame2jpg(fb, 80, &jpg_buf, &jpg_buf_len);
      if (!owns_jpg_buf) {
        Serial.println("JPEG compression failed");
        res = ESP_FAIL;
      }
    }

    if (res == ESP_OK) {
      // 128 bytes hold the longest possible header (boundary + two header
      // lines + a 10-digit length); the check below guards against truncation.
      char part_buf[128];
      const int hlen = snprintf(part_buf, sizeof(part_buf),
                                "\r\n--%s\r\nContent-Type: image/jpeg\r\nContent-Length: %u\r\n\r\n",
                                stream_boundary, static_cast<unsigned int>(jpg_buf_len));
      if (hlen < 0 || static_cast<size_t>(hlen) >= sizeof(part_buf)) {
        res = ESP_FAIL;
      } else {
        res = httpd_resp_send_chunk(req, part_buf, hlen);
      }
    }
    if (res == ESP_OK) {
      res = httpd_resp_send_chunk(req, reinterpret_cast<const char *>(jpg_buf), jpg_buf_len);
    }

    if (owns_jpg_buf) {
      free(jpg_buf);
    }
    // Hand the frame back exactly once, after its last use.
    esp_camera_fb_return(fb);
  }
  return res;
}
// Face detection handler
/**
 * Handles control requests of the form "?var=face_detect&val=<int>".
 *
 * When the query carries var=face_detect together with a val parameter, the
 * global faceDetectionEnabled is set to (val != 0) and the new state is logged.
 * Any other query, or no query at all, is accepted and ignored.
 *
 * @param req  Incoming request.
 * @return Result of sending the empty reply, or ESP_FAIL (after a 500
 *         response) when the query buffer cannot be allocated.
 */
esp_err_t controlHandler(httpd_req_t *req) {
  // +1 leaves room for the terminating NUL of the query string.
  const size_t buf_len = httpd_req_get_url_query_len(req) + 1;
  if (buf_len > 1) {
    char *buf = static_cast<char *>(malloc(buf_len));
    if (buf == NULL) {
      httpd_resp_send_500(req);
      return ESP_FAIL;
    }
    // Only parse the buffer when it was actually filled in.
    if (httpd_req_get_url_query_str(req, buf, buf_len) == ESP_OK) {
      char param[16];
      char val[8];
      if (httpd_query_key_value(buf, "var", param, sizeof(param)) == ESP_OK &&
          strcmp(param, "face_detect") == 0 &&
          httpd_query_key_value(buf, "val", val, sizeof(val)) == ESP_OK) {
        faceDetectionEnabled = (atoi(val) != 0);
        Serial.printf("Face Detection: %s\n", faceDetectionEnabled ? "Enabled" : "Disabled");
      }
    }
    free(buf);
  }
  // Allow pages served from another origin to call this endpoint.
  httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
  return httpd_resp_send(req, NULL, 0);
}
// Start the camera server
void startCameraServer() {
httpd_config_t config = HTTPD_DEFAULT_CONFIG();
httpd_handle_t server = NULL;
httpd_uri_t stream_uri = {
.uri = "/stream",
.method = HTTP_GET,
.handler = streamHandler,
.user_ctx = NULL
};
httpd_uri_t control_uri = {
.uri = "/control",
.method = HTTP_GET,
.handler = controlHandler,
.user_ctx = NULL
};
if (httpd_start(&server, &config) == ESP_OK) {
httpd_register_uri_handler(server, &stream_uri);
httpd_register_uri_handler(server, &control_uri);
}
}
The application interface is implemented as a web server hosted on the ESP32-S3. It streams the video feed and processes frames to detect and highlight faces.
// Build-time configuration for face detection / recognition.
// NOTE(review): the excerpt ended with an #endif that had no matching #if. The
// opener is restored as CONFIG_ESP_FACE_DETECT_ENABLED because every use of the
// FACE_COLOR_* macros below sits under that same guard - confirm against the
// full file.
#if CONFIG_ESP_FACE_DETECT_ENABLED
#if CONFIG_ESP_FACE_RECOGNITION_ENABLED
// The recognition headers are included with -Wformat / -Wstrict-aliasing
// silenced; both diagnostics are switched back on right after the includes.
#pragma GCC diagnostic ignored "-Wformat"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "face_recognition_tool.hpp"
#include "face_recognition_112_v1_s16.hpp"
#include "face_recognition_112_v1_s8.hpp"
#pragma GCC diagnostic error "-Wformat"
#pragma GCC diagnostic warning "-Wstrict-aliasing"
#define QUANT_TYPE 0 //if set to 1 => very large firmware, very slow, reboots when streaming...
// Upper bound on enrolled faces; enrolment is refused once this many are stored.
#define FACE_ID_SAVE_NUMBER 7
#endif
// Overlay colours, laid out as 0x00BBGGRR (red in the lowest byte).
#define FACE_COLOR_WHITE 0x00FFFFFF
#define FACE_COLOR_BLACK 0x00000000
#define FACE_COLOR_RED 0x000000FF
#define FACE_COLOR_GREEN 0x0000FF00
#define FACE_COLOR_BLUE 0x00FF0000
#define FACE_COLOR_YELLOW (FACE_COLOR_RED | FACE_COLOR_GREEN)
#define FACE_COLOR_CYAN (FACE_COLOR_BLUE | FACE_COLOR_GREEN)
#define FACE_COLOR_PURPLE (FACE_COLOR_BLUE | FACE_COLOR_RED)
#endif
// Enable LED FLASH setting
// Set to 0 to compile out the LED definitions guarded below.
#define CONFIG_LED_ILLUMINATOR_ENABLED 1
// LED FLASH setup
#if CONFIG_LED_ILLUMINATOR_ENABLED
#define LED_LEDC_GPIO 22 //configure LED pin
// NOTE(review): presumably the highest duty value accepted for the LED - confirm against the code that uses it.
#define CONFIG_LED_MAX_INTENSITY 255
// Current LED duty, initially 0; not referenced elsewhere in this excerpt.
int led_duty = 0;
// Streaming flag, initially false; not referenced elsewhere in this excerpt.
bool isStreaming = false;
#endif
// Pairs an HTTP request with a running byte count.
// NOTE(review): presumably the context handed to a chunked JPEG encoder
// callback - the code using it is not part of this excerpt.
struct jpg_chunking_t {
  httpd_req_t *req;  // request the chunks belong to
  size_t len;        // byte counter
};
// Multipart boundary token shared by the three strings below.
#define PART_BOUNDARY "123456789000000000000987654321"
// Content-Type value announcing a multipart/x-mixed-replace body with that boundary.
static const char *_STREAM_CONTENT_TYPE = "multipart/x-mixed-replace;boundary=" PART_BOUNDARY;
// Separator emitted between parts: CRLF, "--", the boundary token, CRLF.
static const char *_STREAM_BOUNDARY = "\r\n--" PART_BOUNDARY "\r\n";
// printf-style per-part header; arguments are the image length (%u) and two
// integers formatted as "<int>.<6-digit int>" for X-Timestamp.
static const char *_STREAM_PART = "Content-Type: image/jpeg\r\nContent-Length: %u\r\nX-Timestamp: %d.%06d\r\n\r\n";
// HTTP server handles, NULL until assigned.
// NOTE(review): the names suggest one server for the stream and one for the
// control/UI endpoints - the code that starts them is not in this excerpt.
httpd_handle_t stream_httpd = NULL;
httpd_handle_t camera_httpd = NULL;
// Runtime state for face detection and (optionally) recognition. Everything
// here exists only when the matching CONFIG_* option is enabled.
#if CONFIG_ESP_FACE_DETECT_ENABLED
// Detection on/off flag, off by default.
static int8_t detection_enabled = 0;
// The detector instances below are commented out in this excerpt.
// #if TWO_STAGE
// static HumanFaceDetectMSR01 s1(0.1F, 0.5F, 10, 0.2F);
// static HumanFaceDetectMNP01 s2(0.5F, 0.3F, 5);
// #else
// static HumanFaceDetectMSR01 s1(0.3F, 0.5F, 10, 0.2F);
// #endif
#if CONFIG_ESP_FACE_RECOGNITION_ENABLED
// Recognition on/off flag, off by default.
static int8_t recognition_enabled = 0;
// Non-zero while the next face should be enrolled (read by run_face_recognition()).
static int8_t is_enrolling = 0;
// QUANT_TYPE selects the recognizer class at compile time.
#if QUANT_TYPE
// S16 model
FaceRecognition112V1S16 recognizer;
#else
// S8 model
FaceRecognition112V1S8 recognizer;
#endif
#endif
#endif
// Rolling-average filter over the most recent `size` integer samples.
typedef struct {
  size_t size;   //number of values used for filtering
  size_t index;  //current value index
  size_t count;  //value count
  int sum;       //sum of the values currently stored
  int *values;   //array to be filled with values
} ra_filter_t;

static ra_filter_t ra_filter;

/**
 * Resets `filter` and allocates zeroed storage for `sample_size` samples.
 *
 * @param filter       Filter to initialise; always zeroed first.
 * @param sample_size  Window length in samples.
 * @return `filter` on success; NULL when `sample_size` is 0 or the allocation
 *         fails. In the NULL case `filter->values` stays NULL, which makes
 *         ra_filter_run() pass its input straight through.
 */
static ra_filter_t *ra_filter_init(ra_filter_t *filter, size_t sample_size) {
  memset(filter, 0, sizeof(ra_filter_t));

  // A zero-length window would later divide by zero in ra_filter_run().
  if (sample_size == 0) {
    return NULL;
  }

  filter->values = (int *)malloc(sample_size * sizeof(int));
  if (!filter->values) {
    return NULL;
  }
  memset(filter->values, 0, sample_size * sizeof(int));

  filter->size = sample_size;
  return filter;
}

#if ARDUHAL_LOG_LEVEL >= ARDUHAL_LOG_LEVEL_INFO
/**
 * Pushes `value` into the window and returns the average of the stored samples.
 *
 * The oldest sample is overwritten once the window is full. Until then the
 * average is taken over the samples received so far.
 *
 * @param filter  Filter prepared with ra_filter_init().
 * @param value   New sample.
 * @return The integer average, or `value` unchanged when the filter has no
 *         storage.
 */
static int ra_filter_run(ra_filter_t *filter, int value) {
  if (!filter->values || filter->size == 0) {
    return value;
  }

  // Replace the oldest sample and keep the running sum in step.
  filter->sum -= filter->values[filter->index];
  filter->values[filter->index] = value;
  filter->sum += value;
  filter->index = (filter->index + 1) % filter->size;

  if (filter->count < filter->size) {
    filter->count++;
  }

  // Divide as signed ints: dividing by the size_t count directly would convert
  // a negative sum to a huge unsigned value first.
  return filter->sum / static_cast<int>(filter->count);
}
#endif
#if CONFIG_ESP_FACE_DETECT_ENABLED
#if CONFIG_ESP_FACE_RECOGNITION_ENABLED
// Draws `str` in `color` on row 10 of the frame, horizontally centred.
// NOTE(review): the 14 is presumably the glyph width in pixels - confirm.
// NOTE(review): the subtraction involves strlen()'s unsigned result, so text
// wider than the frame would wrap around instead of going negative.
static void rgb_print(fb_data_t *fb, uint32_t color, const char *str) {
  const auto text_width = strlen(str) * 14;
  const auto left_edge = (fb->width - text_width) / 2;
  fb_gfx_print(fb, left_edge, 10, color, str);
}
/**
 * printf-style wrapper around rgb_print().
 *
 * Formats into a 64-byte stack buffer and switches to a heap buffer only when
 * the text does not fit.
 *
 * @param fb      Frame to draw on.
 * @param color   Text colour, passed through to rgb_print().
 * @param format  printf format string followed by its arguments.
 * @return Length of the formatted text (without the terminating NUL), or 0
 *         when formatting or the heap allocation fails; nothing is drawn then.
 */
static int rgb_printf(fb_data_t *fb, uint32_t color, const char *format, ...) {
  char loc_buf[64];
  char *temp = loc_buf;
  va_list arg;
  va_list copy;

  va_start(arg, format);
  // The first pass runs on a copy: a va_list may be traversed only once, and
  // `arg` is still needed if the text has to be formatted again.
  va_copy(copy, arg);
  const int len = vsnprintf(loc_buf, sizeof(loc_buf), format, copy);
  va_end(copy);

  if (len < 0) {
    va_end(arg);
    return 0;
  }

  if (static_cast<size_t>(len) >= sizeof(loc_buf)) {
    temp = static_cast<char *>(malloc(len + 1));
    if (temp == NULL) {
      va_end(arg);
      return 0;
    }
    vsnprintf(temp, len + 1, format, arg);
  }
  va_end(arg);

  rgb_print(fb, color, temp);

  // Free exactly when the heap buffer was used, including len == 64.
  if (temp != loc_buf) {
    free(temp);
  }
  return len;
}
#endif
static void draw_face_boxes(fb_data_t *fb, std::list *results, int face_id) {
int x, y, w, h;
uint32_t color = FACE_COLOR_YELLOW;
if (face_id < 0) {
color = FACE_COLOR_RED;
} else if (face_id > 0) {
color = FACE_COLOR_GREEN;
}
if (fb->bytes_per_pixel == 2) {
//color = ((color >> 8) & 0xF800) | ((color >> 3) & 0x07E0) | (color & 0x001F);
color = ((color >> 16) & 0x001F) | ((color >> 3) & 0x07E0) | ((color << 8) & 0xF800);
}
int i = 0;
for (std::list::iterator prediction = results->begin(); prediction != results->end(); prediction++, i++) {
// rectangle box
x = (int)prediction->box[0];
y = (int)prediction->box[1];
w = (int)prediction->box[2] - x + 1;
h = (int)prediction->box[3] - y + 1;
if ((x + w) > fb->width) {
w = fb->width - x;
}
if ((y + h) > fb->height) {
h = fb->height - y;
}
fb_gfx_drawFastHLine(fb, x, y, w, color);
fb_gfx_drawFastHLine(fb, x, y + h - 1, w, color);
fb_gfx_drawFastVLine(fb, x, y, h, color);
fb_gfx_drawFastVLine(fb, x + w - 1, y, h, color);
#if TWO_STAGE
// landmarks (left eye, mouth left, nose, right eye, mouth right)
int x0, y0, j;
for (j = 0; j < 10; j += 2) {
x0 = (int)prediction->keypoint[j];
y0 = (int)prediction->keypoint[j + 1];
fb_gfx_fillRect(fb, x0, y0, 3, 3, color);
}
#endif
}
}
#if CONFIG_ESP_FACE_RECOGNITION_ENABLED
static int run_face_recognition(fb_data_t *fb, std::list *results) {
std::vector landmarks = results->front().keypoint;
int id = -1;
Tensor tensor;
tensor.set_element((uint8_t *)fb->data).set_shape({fb->height, fb->width, 3}).set_auto_free(false);
int enrolled_count = recognizer.get_enrolled_id_num();
if (enrolled_count < FACE_ID_SAVE_NUMBER && is_enrolling) {
id = recognizer.enroll_id(tensor, landmarks, "", true);
log_i("Enrolled ID: %d", id);
rgb_printf(fb, FACE_COLOR_CYAN, "ID[%u]", id);
}
face_info_t recognize = recognizer.recognize(tensor, landmarks);
if (recognize.id >= 0) {
rgb_printf(fb, FACE_COLOR_GREEN, "ID[%u]: %.2f", recognize.id, recognize.similarity);
} else {
rgb_print(fb, FACE_COLOR_RED, "Intruder Alert!");
}
return recognize.id;
}
#endif
#endif
The camera successfully streamed video data, and face detection worked as intended, albeit with some lag and moderate image quality.