kpu_run_kmodel never completes when using another kmodel
-
Hello,
I'm trying to perform inference with my own kmodel on Sipeed M1.
Here is my main, inspired by the face_detect from the kendryte-standalone-demo:
#include <stdio.h> #include <string.h> #include <unistd.h> #include <stdlib.h> #include <math.h> #include "bsp.h" #include "sysctl.h" #include "plic.h" #include "utils.h" #include "gpiohs.h" #include "fpioa.h" #include "lcd.h" #include "nt35310.h" #include "dvp.h" #include "ov5640.h" #include "ov2640.h" #include "uarths.h" #include "kpu.h" #include "detector.h" #include "image_process.h" #include "board_config.h" #include "w25qxx.h" #define INCBIN_STYLE INCBIN_STYLE_SNAKE #define INCBIN_PREFIX #include "incbin.h" #define PLL0_OUTPUT_FREQ 800000000UL #define PLL1_OUTPUT_FREQ 400000000UL volatile uint32_t g_ai_done_flag; volatile uint8_t g_dvp_finish_flag; static image_t kpu_image, display_image; kpu_model_context_t net_task; static region_layer_t detect_rl; static obj_info_t face_detect_info; #define ANCHOR_NUM 25 #define LOAD_KMODEL_FROM_FLASH 0 #if LOAD_KMODEL_FROM_FLASH #define KMODEL_SIZE (250 * 1024) uint8_t model_data[KMODEL_SIZE]; #else INCBIN(model, "detect.kmodel"); #endif static void ai_done(void *ctx) { g_ai_done_flag = 1; } static int dvp_irq(void *ctx) { if (dvp_get_interrupt(DVP_STS_FRAME_FINISH)) { dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 0); dvp_clear_interrupt(DVP_STS_FRAME_FINISH); g_dvp_finish_flag = 1; } else { dvp_start_convert(); dvp_clear_interrupt(DVP_STS_FRAME_START); } return 0; } static void io_mux_init(void) { printf("Detected : K210"); /* Init DVP IO map and function settings */ fpioa_set_function(42, FUNC_CMOS_RST); fpioa_set_function(44, FUNC_CMOS_PWDN); fpioa_set_function(46, FUNC_CMOS_XCLK); fpioa_set_function(43, FUNC_CMOS_VSYNC); fpioa_set_function(45, FUNC_CMOS_HREF); fpioa_set_function(47, FUNC_CMOS_PCLK); fpioa_set_function(41, FUNC_SCCB_SCLK); fpioa_set_function(40, FUNC_SCCB_SDA); /* Init SPI IO map and function settings */ fpioa_set_function(38, FUNC_GPIOHS0 + DCX_GPIONUM); fpioa_set_function(36, FUNC_SPI0_SS3); fpioa_set_function(39, FUNC_SPI0_SCLK); fpioa_set_function(37, FUNC_GPIOHS0 + 
RST_GPIONUM); sysctl_set_spi0_dvp_data(1); } static void io_set_power(void) { /* Set dvp and spi pin to 1.8V */ sysctl_set_power_mode(SYSCTL_POWER_BANK6, SYSCTL_POWER_V18); sysctl_set_power_mode(SYSCTL_POWER_BANK7, SYSCTL_POWER_V18); } static void draw_edge(uint32_t *gram, obj_info_t *obj_info, uint32_t index, uint16_t color) { uint32_t data = ((uint32_t)color << 16) | (uint32_t)color; uint32_t *addr1, *addr2, *addr3, *addr4, x1, y1, x2, y2; x1 = obj_info->obj[index].x1; y1 = obj_info->obj[index].y1; x2 = obj_info->obj[index].x2; y2 = obj_info->obj[index].y2; if (x1 <= 0) x1 = 1; if (x2 >= 319) x2 = 318; if (y1 <= 0) y1 = 1; if (y2 >= 239) y2 = 238; addr1 = gram + (320 * y1 + x1) / 2; addr2 = gram + (320 * y1 + x2 - 8) / 2; addr3 = gram + (320 * (y2 - 1) + x1) / 2; addr4 = gram + (320 * (y2 - 1) + x2 - 8) / 2; for (uint32_t i = 0; i < 4; i++) { *addr1 = data; *(addr1 + 160) = data; *addr2 = data; *(addr2 + 160) = data; *addr3 = data; *(addr3 + 160) = data; *addr4 = data; *(addr4 + 160) = data; addr1++; addr2++; addr3++; addr4++; } addr1 = gram + (320 * y1 + x1) / 2; addr2 = gram + (320 * y1 + x2 - 2) / 2; addr3 = gram + (320 * (y2 - 8) + x1) / 2; addr4 = gram + (320 * (y2 - 8) + x2 - 2) / 2; for (uint32_t i = 0; i < 8; i++) { *addr1 = data; *addr2 = data; *addr3 = data; *addr4 = data; addr1 += 160; addr2 += 160; addr3 += 160; addr4 += 160; } } int main(void) { /* Set CPU and dvp clk */ sysctl_pll_set_freq(SYSCTL_PLL0, PLL0_OUTPUT_FREQ); sysctl_pll_set_freq(SYSCTL_PLL1, PLL1_OUTPUT_FREQ); sysctl_clock_enable(SYSCTL_CLOCK_AI); uarths_init(); io_mux_init(); io_set_power(); plic_init(); /* flash init */ printf("flash init\n"); w25qxx_init(3, 0); w25qxx_enable_quad_mode(); #if LOAD_KMODEL_FROM_FLASH w25qxx_read_data(0xA00000, model_data, KMODEL_SIZE, W25QXX_QUAD_FAST); #endif /* LCD init */ printf("LCD init\n"); lcd_init(); lcd_set_direction(DIR_YX_RLDU); lcd_clear(BLACK); /* DVP init */ printf("DVP init\n"); printf("Found camera OV2640\n"); dvp_init(8); 
dvp_set_xclk_rate(24000000); dvp_enable_burst(); dvp_set_output_enable(0, 1); dvp_set_output_enable(1, 1); dvp_set_image_format(DVP_CFG_RGB_FORMAT); dvp_set_image_size(320, 240); ov2640_init(); kpu_image.pixel = 3; kpu_image.width = 320; kpu_image.height = 240; image_init(&kpu_image); display_image.pixel = 3; display_image.width = 320; display_image.height = 240; image_init(&display_image); dvp_set_ai_addr((uint32_t)kpu_image.addr, (uint32_t)(kpu_image.addr + 320 * 240), (uint32_t)(kpu_image.addr + 320 * 240 * 2)); dvp_set_display_addr((uint32_t)display_image.addr); dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 0); dvp_disable_auto(); /* DVP interrupt config */ printf("DVP interrupt config\n"); plic_set_priority(IRQN_DVP_INTERRUPT, 1); plic_irq_register(IRQN_DVP_INTERRUPT, dvp_irq, NULL); plic_irq_enable(IRQN_DVP_INTERRUPT); /* init face detect model */ if (kpu_load_kmodel(&net_task, model_data) != 0) { printf("\nmodel init error\n"); while (1); } detect_rl.anchor_number = ANCHOR_NUM; //detect_rl.anchor = anchor; detect_rl.threshold = 0.7; detect_rl.nms_value = 0.3; //region_layer_init(&detect_rl, 20, 15, 125, 320, 240); /* enable global interrupt */ sysctl_enable_irq(); /* system start */ printf("System start\n"); while (1) { g_dvp_finish_flag = 0; dvp_clear_interrupt(DVP_STS_FRAME_START | DVP_STS_FRAME_FINISH); dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 1); while (g_dvp_finish_flag == 0) ; /* run face detect */ g_ai_done_flag = 0; printf("Run model...\n"); kpu_run_kmodel(&net_task, kpu_image.addr, DMAC_CHANNEL5, ai_done, NULL); printf("Waiting for AI to finish...\n"); while(!g_ai_done_flag); printf("AI Done.\n"); /* display picture */ lcd_draw_picture(0, 0, 320, 240, (uint32_t *)display_image.addr); } }
The detect.kmodel from the face_detect example works perfectly.
However, when I try to use my own kmodel, the program gets stuck at line 238 (g_ai_done_flag is never set to 1). What could be the source of the problem? Is there a workaround?
Thank you in advance for your help!
-
Can you try with the newest nncase and SDK?
-
@sunnycase No, the largest number of output channels is 300. My input dimension is 320x240x3, and the output is 20x15x125. After padding the input image, the largest input of a Conv2D layer is 322x242x3, and ncc transforms it to kmodel without problems.
-
Is there any conv2d layer in your model that has more than 1024 output channels?
-
@latyas The program stays in the "while(!g_ai_done_flag)" loop at line 238. The ai_done callback (defined at line 47) is never called.
-
@sni What does the term "stuck" mean? Did the chip hang so that it can no longer be reached over JTAG, or is it just spinning in some loop?