kpu_run_kmodel function never completes when using another kmodel



  • Hello,

    I'm trying to perform inference with my own kmodel on Sipeed M1.

    Here is my main, inspired by the face_detect from the kendryte-standalone-demo:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <stdlib.h>
    #include <math.h>
    #include "bsp.h"
    #include "sysctl.h"
    #include "plic.h"
    #include "utils.h"
    #include "gpiohs.h"
    #include "fpioa.h"
    #include "lcd.h"
    #include "nt35310.h"
    #include "dvp.h"
    #include "ov5640.h"
    #include "ov2640.h"
    #include "uarths.h"
    #include "kpu.h"
    #include "detector.h"
    #include "image_process.h"
    #include "board_config.h"
    #include "w25qxx.h"
    #define INCBIN_STYLE INCBIN_STYLE_SNAKE
    #define INCBIN_PREFIX
    #include "incbin.h"
    
    /* PLL target frequencies passed to sysctl_pll_set_freq() in main()
     * (values are presumably in Hz -- confirm against the SDK). */
    #define PLL0_OUTPUT_FREQ 800000000UL
    #define PLL1_OUTPUT_FREQ 400000000UL
    
    /* Set to 1 by ai_done(); cleared and busy-polled by the main loop. */
    volatile uint32_t g_ai_done_flag;
    /* Set to 1 by dvp_irq() on frame finish; cleared and busy-polled by the main loop. */
    volatile uint8_t g_dvp_finish_flag;
    /* Frame buffers: kpu_image receives the DVP "AI" output and is fed to the
     * model; display_image receives the DVP display output and is drawn on the LCD. */
    static image_t kpu_image, display_image;
    
    /* Model context initialised by kpu_load_kmodel() and run by kpu_run_kmodel(). */
    kpu_model_context_t net_task;
    /* Region-layer settings; main() only fills anchor_number, threshold and nms_value. */
    static region_layer_t detect_rl;
    /* NOTE(review): only read by draw_edge()-style helpers; not referenced in main(). */
    static obj_info_t face_detect_info;
    /* Value assigned to detect_rl.anchor_number in main(). */
    #define ANCHOR_NUM 25
    /* 0: embed the model in the binary via INCBIN; 1: read it from flash at start-up. */
    #define  LOAD_KMODEL_FROM_FLASH 0
    
    #if LOAD_KMODEL_FROM_FLASH
    /* Number of bytes main() reads from flash into model_data. */
    #define KMODEL_SIZE (250 * 1024)
    uint8_t model_data[KMODEL_SIZE];
    #else
    /* Embeds detect.kmodel; main() uses model_data, presumably the symbol INCBIN
     * generates for the name "model" with the snake style and empty prefix set above. */
    INCBIN(model, "detect.kmodel");
    #endif
    
    /*
     * Completion callback handed to kpu_run_kmodel() in main().
     * Raises g_ai_done_flag, which the main loop spins on.
     *
     * ctx - user data pointer supplied at registration; not used here.
     */
    static void ai_done(void *ctx)
    {
        (void)ctx;

        g_ai_done_flag = 1;
    }
    
    /*
     * DVP interrupt handler (registered for IRQN_DVP_INTERRUPT in main()).
     *
     * If the frame-finish status is not pending, the interrupt is handled as a
     * frame start: conversion is started and the frame-start status is cleared.
     * On frame finish, dvp_config_interrupt() is called with 0 for the start and
     * finish interrupts (main() re-arms them with 1 before each capture), the
     * frame-finish status is cleared and g_dvp_finish_flag is raised.
     *
     * ctx - registration argument; not used.
     * Returns 0 in every case.
     */
    static int dvp_irq(void *ctx)
    {
        if (!dvp_get_interrupt(DVP_STS_FRAME_FINISH))
        {
            /* Not a finish event: kick off conversion for the incoming frame. */
            dvp_start_convert();
            dvp_clear_interrupt(DVP_STS_FRAME_START);
            return 0;
        }

        /* Frame complete: stop further DVP interrupts and notify the main loop. */
        dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 0);
        dvp_clear_interrupt(DVP_STS_FRAME_FINISH);
        g_dvp_finish_flag = 1;
        return 0;
    }
    
    /*
     * Route the FPIOA pins used by the camera (DVP + SCCB) and by the LCD (SPI0
     * plus two GPIOHS control lines), then call sysctl_set_spi0_dvp_data(1).
     *
     * Also prints a one-line banner. The banner is newline-terminated so that it
     * does not run into the next log message printed by main().
     */
    static void io_mux_init(void)
    {
        printf("Detected : K210\n");

        /* Init DVP IO map and function settings */
        fpioa_set_function(42, FUNC_CMOS_RST);
        fpioa_set_function(44, FUNC_CMOS_PWDN);
        fpioa_set_function(46, FUNC_CMOS_XCLK);
        fpioa_set_function(43, FUNC_CMOS_VSYNC);
        fpioa_set_function(45, FUNC_CMOS_HREF);
        fpioa_set_function(47, FUNC_CMOS_PCLK);
        fpioa_set_function(41, FUNC_SCCB_SCLK);
        fpioa_set_function(40, FUNC_SCCB_SDA);

        /* Init SPI IO map and function settings */
        fpioa_set_function(38, FUNC_GPIOHS0 + DCX_GPIONUM);
        fpioa_set_function(36, FUNC_SPI0_SS3);
        fpioa_set_function(39, FUNC_SPI0_SCLK);
        fpioa_set_function(37, FUNC_GPIOHS0 + RST_GPIONUM);

        sysctl_set_spi0_dvp_data(1);
    }
    
    /*
     * Put the IO power banks that carry the DVP and SPI pins (banks 6 and 7)
     * into 1.8V mode.
     */
    static void io_set_power(void)
    {
        const int banks_1v8[] = { SYSCTL_POWER_BANK6, SYSCTL_POWER_BANK7 };
        const size_t bank_count = sizeof banks_1v8 / sizeof banks_1v8[0];

        for (size_t k = 0; k < bank_count; k++)
        {
            sysctl_set_power_mode(banks_1v8[k], SYSCTL_POWER_V18);
        }
    }
    
    /*
     * Draw corner marks for the bounding box obj_info->obj[index] into a frame
     * buffer.
     *
     * gram     - frame buffer addressed as 32-bit words. The index math uses 320
     *            pixels per row and two pixels per word (160 words per row); the
     *            clamps assume 240 rows -- presumably a 320x240 16-bit image,
     *            confirm against the caller.
     * obj_info - detection results; only obj[index].x1/y1/x2/y2 are read.
     * index    - entry of obj_info->obj to draw.
     * color    - 16-bit pixel value, duplicated into both halves of every word.
     *
     * Each corner gets a horizontal stroke (4 words wide, 2 rows thick) and a
     * vertical stroke (1 word wide, 8 rows tall).
     *
     * The coordinates are unsigned, so y2 - 8 would wrap for y2 < 8 and produce a
     * far out-of-bounds write; likewise the 8-row vertical stroke starting at y1
     * would run past row 239 for y1 > 232. Both are clamped here. Boxes that
     * already fit are drawn exactly as before.
     */
    static void draw_edge(uint32_t *gram, obj_info_t *obj_info, uint32_t index, uint16_t color)
    {
        uint32_t data = ((uint32_t)color << 16) | (uint32_t)color;
        uint32_t *addr1, *addr2, *addr3, *addr4, x1, y1, x2, y2;

        x1 = obj_info->obj[index].x1;
        y1 = obj_info->obj[index].y1;
        x2 = obj_info->obj[index].x2;
        y2 = obj_info->obj[index].y2;

        /* Keep the box off the outermost pixel border. */
        if (x1 == 0)
            x1 = 1;
        if (x2 >= 319)
            x2 = 318;
        if (y1 == 0)
            y1 = 1;
        if (y2 >= 239)
            y2 = 238;

        /* Keep the 8-row vertical strokes inside rows 0..239. */
        if (y1 > 232)
            y1 = 232;
        if (y2 < 8)
            y2 = 8;

        /* Horizontal strokes: 4 words (8 pixels) wide, on two consecutive rows. */
        addr1 = gram + (320 * y1 + x1) / 2;
        addr2 = gram + (320 * y1 + x2 - 8) / 2;
        addr3 = gram + (320 * (y2 - 1) + x1) / 2;
        addr4 = gram + (320 * (y2 - 1) + x2 - 8) / 2;
        for (uint32_t i = 0; i < 4; i++)
        {
            *addr1 = data;
            *(addr1 + 160) = data;
            *addr2 = data;
            *(addr2 + 160) = data;
            *addr3 = data;
            *(addr3 + 160) = data;
            *addr4 = data;
            *(addr4 + 160) = data;
            addr1++;
            addr2++;
            addr3++;
            addr4++;
        }

        /* Vertical strokes: 1 word (2 pixels) wide, 8 rows tall (160 words per row). */
        addr1 = gram + (320 * y1 + x1) / 2;
        addr2 = gram + (320 * y1 + x2 - 2) / 2;
        addr3 = gram + (320 * (y2 - 8) + x1) / 2;
        addr4 = gram + (320 * (y2 - 8) + x2 - 2) / 2;
        for (uint32_t i = 0; i < 8; i++)
        {
            *addr1 = data;
            *addr2 = data;
            *addr3 = data;
            *addr4 = data;
            addr1 += 160;
            addr2 += 160;
            addr3 += 160;
            addr4 += 160;
        }
    }
    
    /*
     * Program entry point.
     *
     * Brings up clocks, UART, pin routing, flash, LCD and the camera interface,
     * loads the kmodel, then loops forever: capture one frame, run the model on
     * it, wait for the completion callback, and draw the frame on the LCD.
     *
     * The return value of kpu_run_kmodel() is checked: if the run cannot be
     * started, the completion callback will never fire, so the error is reported
     * and the program halts instead of silently spinning on g_ai_done_flag.
     * Never returns.
     */
    int main(void)
    {
        /* Set CPU and dvp clk */
        sysctl_pll_set_freq(SYSCTL_PLL0, PLL0_OUTPUT_FREQ);
        sysctl_pll_set_freq(SYSCTL_PLL1, PLL1_OUTPUT_FREQ);
        sysctl_clock_enable(SYSCTL_CLOCK_AI);
        uarths_init();

        io_mux_init();
        io_set_power();
        plic_init();
        /* flash init */
        printf("flash init\n");
        w25qxx_init(3, 0);
        w25qxx_enable_quad_mode();
    #if LOAD_KMODEL_FROM_FLASH
        /* Copy the model image from flash offset 0xA00000 into RAM. */
        w25qxx_read_data(0xA00000, model_data, KMODEL_SIZE, W25QXX_QUAD_FAST);
    #endif
        /* LCD init */
        printf("LCD init\n");
        lcd_init();

        lcd_set_direction(DIR_YX_RLDU);

        lcd_clear(BLACK);
        /* DVP init */
        printf("DVP init\n");

        printf("Found camera OV2640\n");
        dvp_init(8);
        dvp_set_xclk_rate(24000000);
        dvp_enable_burst();
        dvp_set_output_enable(0, 1);
        dvp_set_output_enable(1, 1);
        dvp_set_image_format(DVP_CFG_RGB_FORMAT);
        dvp_set_image_size(320, 240);
        ov2640_init();

        /* Both buffers are described as 320x240 with pixel = 3 before image_init(). */
        kpu_image.pixel = 3;
        kpu_image.width = 320;
        kpu_image.height = 240;
        image_init(&kpu_image);
        display_image.pixel = 3;
        display_image.width = 320;
        display_image.height = 240;
        image_init(&display_image);

        /* Three consecutive 320*240 planes inside kpu_image for the AI output. */
        dvp_set_ai_addr((uint32_t)kpu_image.addr, (uint32_t)(kpu_image.addr + 320 * 240), (uint32_t)(kpu_image.addr + 320 * 240 * 2));

        dvp_set_display_addr((uint32_t)display_image.addr);
        dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 0);
        dvp_disable_auto();
        /* DVP interrupt config */
        printf("DVP interrupt config\n");
        plic_set_priority(IRQN_DVP_INTERRUPT, 1);
        plic_irq_register(IRQN_DVP_INTERRUPT, dvp_irq, NULL);
        plic_irq_enable(IRQN_DVP_INTERRUPT);
        /* init face detect model */
        if (kpu_load_kmodel(&net_task, model_data) != 0)
        {
            printf("\nmodel init error\n");
            while (1)
                ;
        }
        detect_rl.anchor_number = ANCHOR_NUM;
        //detect_rl.anchor = anchor;
        detect_rl.threshold = 0.7;
        detect_rl.nms_value = 0.3;
        //region_layer_init(&detect_rl, 20, 15, 125, 320, 240);
        /* enable global interrupt */
        sysctl_enable_irq();
        /* system start */
        printf("System start\n");
        while (1)
        {
            /* Arm the DVP interrupts and wait for dvp_irq() to report a full frame. */
            g_dvp_finish_flag = 0;
            dvp_clear_interrupt(DVP_STS_FRAME_START | DVP_STS_FRAME_FINISH);
            dvp_config_interrupt(DVP_CFG_START_INT_ENABLE | DVP_CFG_FINISH_INT_ENABLE, 1);
            while (g_dvp_finish_flag == 0)
                ;

            /* run face detect */
            g_ai_done_flag = 0;
            printf("Run model...\n");
            if (kpu_run_kmodel(&net_task, kpu_image.addr, DMAC_CHANNEL5, ai_done, NULL) != 0)
            {
                /* The run was not started, so ai_done() would never be called. */
                printf("\nmodel run error\n");
                while (1)
                    ;
            }
            printf("Waiting for AI to finish...\n");
            /* Spins until ai_done() sets the flag; there is no timeout. */
            while (!g_ai_done_flag)
                ;
            printf("AI Done.\n");

            /* display picture */
            lcd_draw_picture(0, 0, 320, 240, (uint32_t *)display_image.addr);
        }
    }
    

    The detect.kmodel from the face_detect example works perfectly.
    However, when I try to use my own kmodel, the program gets stuck at line 238, in the `while(!g_ai_done_flag);` loop (g_ai_done_flag is never set to 1).

    What could be the source of the problem? Is there a workaround?

    Thank you in advance for your help!


  • |  Mod

    Can you try with the newest nncase and SDK?



  • @sunnycase No, the largest number of output channels is 300. My input dimension is 320x240x3, and the output is 20x15x125. After padding the input image, the largest input of a Conv2D layer is 322x242x3, and ncc transforms it to kmodel without problems.


  • |  Mod

    Is there any conv2d layer in your model that has more than 1024 output channels?



  • @latyas The program stays in the "while(!g_ai_done_flag)" loop at line 238. The ai_done callback (defined at line 47) is never called.


  • Staff |  Mod

    @sni What does the term "stuck" mean? Did the chip hang so that it can no longer be reached over JTAG, or is it just spinning in some loop?