diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index ac6b77af3ec4..1462c5fb73e5 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -50,14 +50,6 @@ static ncnn::VkAllocator* g_staging_vkallocator = 0;
 
 void benchmark(const char* comment, const std::vector<ncnn::Mat>& _in, const ncnn::Option& opt, const char* model_param_data = NULL)
 {
-    // Skip if int8 model name and using GPU
-    if (opt.use_vulkan_compute && strstr(comment, "int8") != NULL)
-    {
-        if (!model_param_data)
-            fprintf(stderr, "%20s  skipped (int8+GPU not supported)\n", comment);
-        return;
-    }
-
     g_blob_pool_allocator.clear();
     g_workspace_pool_allocator.clear();
 
diff --git a/benchmark/models/efficientnet_b0.param b/benchmark/models/efficientnet_b0.param
index 0a762d21cbee..c5014a11fe7d 100644
--- a/benchmark/models/efficientnet_b0.param
+++ b/benchmark/models/efficientnet_b0.param
@@ -9,7 +9,7 @@ Split                    splitncnn_0              1 2 368 368_splitncnn_0 368_sp
 Pooling                  GlobalAveragePool_8      1 1 368_splitncnn_1 369 -23330=4,1,32,1,1 0=1 4=1
 InnerProduct             Conv_9                   1 1 369 370 -23330=4,1,8,1,1 0=8 1=1 2=256
 Swish                    Mul_11                   1 1 370 372 -23330=4,1,8,1,1
-Convolution              Conv_12                  1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 5=1 6=256 9=4
+InnerProduct             Conv_12                  1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 2=256 9=4
 BinaryOp                 Mul_14                   2 1 368_splitncnn_0 374 375 -23330=4,3,112,112,32 0=2
 Convolution              Conv_15                  1 1 375 377 -23330=4,3,112,112,16 0=16 1=1 5=1 6=512
 Convolution              Conv_17                  1 1 377 379 -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536
@@ -20,7 +20,7 @@ Split                    splitncnn_1              1 2 385 385_splitncnn_0 385_sp
 Pooling                  GlobalAveragePool_25     1 1 385_splitncnn_1 386 -23330=4,1,96,1,1 0=1 4=1
 InnerProduct             Conv_26                  1 1 386 387 -23330=4,1,4,1,1 0=4 1=1 2=384
 Swish                    Mul_28                   1 1 387 389 -23330=4,1,4,1,1
-Convolution              Conv_29                  1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 5=1 6=384 9=4
+InnerProduct             Conv_29                  1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 2=384 9=4
 BinaryOp                 Mul_31                   2 1 385_splitncnn_0 391 392 -23330=4,3,56,56,96 0=2
 Convolution              Conv_32                  1 1 392 394 -23330=4,3,56,56,24 0=24 1=1 5=1 6=2304
 Split                    splitncnn_2              1 2 394 394_splitncnn_0 394_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
@@ -32,7 +32,7 @@ Split                    splitncnn_3              1 2 402 402_splitncnn_0 402_sp
 Pooling                  GlobalAveragePool_42     1 1 402_splitncnn_1 403 -23330=4,1,144,1,1 0=1 4=1
 InnerProduct             Conv_43                  1 1 403 404 -23330=4,1,6,1,1 0=6 1=1 2=864
 Swish                    Mul_45                   1 1 404 406 -23330=4,1,6,1,1
-Convolution              Conv_46                  1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4
+InnerProduct             Conv_46                  1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 2=864 9=4
 BinaryOp                 Mul_48                   2 1 402_splitncnn_0 408 409 -23330=4,3,56,56,144 0=2
 Convolution              Conv_49                  1 1 409 411 -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456
 BinaryOp                 Add_51                   2 1 394_splitncnn_0 411 412 -23330=4,3,56,56,24
@@ -44,7 +44,7 @@ Split                    splitncnn_4              1 2 420 420_splitncnn_0 420_sp
 Pooling                  GlobalAveragePool_60     1 1 420_splitncnn_1 421 -23330=4,1,144,1,1 0=1 4=1
 InnerProduct             Conv_61                  1 1 421 422 -23330=4,1,6,1,1 0=6 1=1 2=864
 Swish                    Mul_63                   1 1 422 424 -23330=4,1,6,1,1
-Convolution              Conv_64                  1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4
+InnerProduct             Conv_64                  1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 2=864 9=4
 BinaryOp                 Mul_66                   2 1 420_splitncnn_0 426 427 -23330=4,3,28,28,144 0=2
 Convolution              Conv_67                  1 1 427 429 -23330=4,3,28,28,40 0=40 1=1 5=1 6=5760
 Split                    splitncnn_5              1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
@@ -56,7 +56,7 @@ Split                    splitncnn_6              1 2 437 437_splitncnn_0 437_sp
 Pooling                  GlobalAveragePool_77     1 1 437_splitncnn_1 438 -23330=4,1,240,1,1 0=1 4=1
 InnerProduct             Conv_78                  1 1 438 439 -23330=4,1,10,1,1 0=10 1=1 2=2400
 Swish                    Mul_80                   1 1 439 441 -23330=4,1,10,1,1
-Convolution              Conv_81                  1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4
+InnerProduct             Conv_81                  1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 2=2400 9=4
 BinaryOp                 Mul_83                   2 1 437_splitncnn_0 443 444 -23330=4,3,28,28,240 0=2
 Convolution              Conv_84                  1 1 444 446 -23330=4,3,28,28,40 0=40 1=1 5=1 6=9600
 BinaryOp                 Add_86                   2 1 429_splitncnn_0 446 447 -23330=4,3,28,28,40
@@ -68,7 +68,7 @@ Split                    splitncnn_7              1 2 455 455_splitncnn_0 455_sp
 Pooling                  GlobalAveragePool_95     1 1 455_splitncnn_1 456 -23330=4,1,240,1,1 0=1 4=1
 InnerProduct             Conv_96                  1 1 456 457 -23330=4,1,10,1,1 0=10 1=1 2=2400
 Swish                    Mul_98                   1 1 457 459 -23330=4,1,10,1,1
-Convolution              Conv_99                  1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4
+InnerProduct             Conv_99                  1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 2=2400 9=4
 BinaryOp                 Mul_101                  2 1 455_splitncnn_0 461 462 -23330=4,3,14,14,240 0=2
 Convolution              Conv_102                 1 1 462 464 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
 Split                    splitncnn_8              1 2 464 464_splitncnn_0 464_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
@@ -80,7 +80,7 @@ Split                    splitncnn_9              1 2 472 472_splitncnn_0 472_sp
 Pooling                  GlobalAveragePool_112    1 1 472_splitncnn_1 473 -23330=4,1,480,1,1 0=1 4=1
 InnerProduct             Conv_113                 1 1 473 474 -23330=4,1,20,1,1 0=20 1=1 2=9600
 Swish                    Mul_115                  1 1 474 476 -23330=4,1,20,1,1
-Convolution              Conv_116                 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
+InnerProduct             Conv_116                 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4
 BinaryOp                 Mul_118                  2 1 472_splitncnn_0 478 479 -23330=4,3,14,14,480 0=2
 Convolution              Conv_119                 1 1 479 481 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
 BinaryOp                 Add_121                  2 1 464_splitncnn_0 481 482 -23330=4,3,14,14,80
@@ -93,7 +93,7 @@ Split                    splitncnn_11             1 2 490 490_splitncnn_0 490_sp
 Pooling                  GlobalAveragePool_130    1 1 490_splitncnn_1 491 -23330=4,1,480,1,1 0=1 4=1
 InnerProduct             Conv_131                 1 1 491 492 -23330=4,1,20,1,1 0=20 1=1 2=9600
 Swish                    Mul_133                  1 1 492 494 -23330=4,1,20,1,1
-Convolution              Conv_134                 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
+InnerProduct             Conv_134                 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4
 BinaryOp                 Mul_136                  2 1 490_splitncnn_0 496 497 -23330=4,3,14,14,480 0=2
 Convolution              Conv_137                 1 1 497 499 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
 BinaryOp                 Add_139                  2 1 482_splitncnn_0 499 500 -23330=4,3,14,14,80
@@ -105,7 +105,7 @@ Split                    splitncnn_12             1 2 508 508_splitncnn_0 508_sp
 Pooling                  GlobalAveragePool_148    1 1 508_splitncnn_1 509 -23330=4,1,480,1,1 0=1 4=1
 InnerProduct             Conv_149                 1 1 509 510 -23330=4,1,20,1,1 0=20 1=1 2=9600
 Swish                    Mul_151                  1 1 510 512 -23330=4,1,20,1,1
-Convolution              Conv_152                 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
+InnerProduct             Conv_152                 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 2=9600 9=4
 BinaryOp                 Mul_154                  2 1 508_splitncnn_0 514 515 -23330=4,3,14,14,480 0=2
 Convolution              Conv_155                 1 1 515 517 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
 Split                    splitncnn_13             1 2 517 517_splitncnn_0 517_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
@@ -117,7 +117,7 @@ Split                    splitncnn_14             1 2 525 525_splitncnn_0 525_sp
 Pooling                  GlobalAveragePool_165    1 1 525_splitncnn_1 526 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             Conv_166                 1 1 526 527 -23330=4,1,28,1,1 0=28 1=1 2=18816
 Swish                    Mul_168                  1 1 527 529 -23330=4,1,28,1,1
-Convolution              Conv_169                 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
+InnerProduct             Conv_169                 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4
 BinaryOp                 Mul_171                  2 1 525_splitncnn_0 531 532 -23330=4,3,14,14,672 0=2
 Convolution              Conv_172                 1 1 532 534 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
 BinaryOp                 Add_174                  2 1 517_splitncnn_0 534 535 -23330=4,3,14,14,112
@@ -130,7 +130,7 @@ Split                    splitncnn_16             1 2 543 543_splitncnn_0 543_sp
 Pooling                  GlobalAveragePool_183    1 1 543_splitncnn_1 544 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             Conv_184                 1 1 544 545 -23330=4,1,28,1,1 0=28 1=1 2=18816
 Swish                    Mul_186                  1 1 545 547 -23330=4,1,28,1,1
-Convolution              Conv_187                 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
+InnerProduct             Conv_187                 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4
 BinaryOp                 Mul_189                  2 1 543_splitncnn_0 549 550 -23330=4,3,14,14,672 0=2
 Convolution              Conv_190                 1 1 550 552 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
 BinaryOp                 Add_192                  2 1 535_splitncnn_0 552 553 -23330=4,3,14,14,112
@@ -142,7 +142,7 @@ Split                    splitncnn_17             1 2 561 561_splitncnn_0 561_sp
 Pooling                  GlobalAveragePool_201    1 1 561_splitncnn_1 562 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             Conv_202                 1 1 562 563 -23330=4,1,28,1,1 0=28 1=1 2=18816
 Swish                    Mul_204                  1 1 563 565 -23330=4,1,28,1,1
-Convolution              Conv_205                 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
+InnerProduct             Conv_205                 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 2=18816 9=4
 BinaryOp                 Mul_207                  2 1 561_splitncnn_0 567 568 -23330=4,3,7,7,672 0=2
 Convolution              Conv_208                 1 1 568 570 -23330=4,3,7,7,192 0=192 1=1 5=1 6=129024
 Split                    splitncnn_18             1 2 570 570_splitncnn_0 570_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
@@ -154,7 +154,7 @@ Split                    splitncnn_19             1 2 578 578_splitncnn_0 578_sp
 Pooling                  GlobalAveragePool_218    1 1 578_splitncnn_1 579 -23330=4,1,1152,1,1 0=1 4=1
 InnerProduct             Conv_219                 1 1 579 580 -23330=4,1,48,1,1 0=48 1=1 2=55296
 Swish                    Mul_221                  1 1 580 582 -23330=4,1,48,1,1
-Convolution              Conv_222                 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
+InnerProduct             Conv_222                 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4
 BinaryOp                 Mul_224                  2 1 578_splitncnn_0 584 585 -23330=4,3,7,7,1152 0=2
 Convolution              Conv_225                 1 1 585 587 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
 BinaryOp                 Add_227                  2 1 570_splitncnn_0 587 588 -23330=4,3,7,7,192
@@ -167,7 +167,7 @@ Split                    splitncnn_21             1 2 596 596_splitncnn_0 596_sp
 Pooling                  GlobalAveragePool_236    1 1 596_splitncnn_1 597 -23330=4,1,1152,1,1 0=1 4=1
 InnerProduct             Conv_237                 1 1 597 598 -23330=4,1,48,1,1 0=48 1=1 2=55296
 Swish                    Mul_239                  1 1 598 600 -23330=4,1,48,1,1
-Convolution              Conv_240                 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
+InnerProduct             Conv_240                 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4
 BinaryOp                 Mul_242                  2 1 596_splitncnn_0 602 603 -23330=4,3,7,7,1152 0=2
 Convolution              Conv_243                 1 1 603 605 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
 BinaryOp                 Add_245                  2 1 588_splitncnn_0 605 606 -23330=4,3,7,7,192
@@ -180,7 +180,7 @@ Split                    splitncnn_23             1 2 614 614_splitncnn_0 614_sp
 Pooling                  GlobalAveragePool_254    1 1 614_splitncnn_1 615 -23330=4,1,1152,1,1 0=1 4=1
 InnerProduct             Conv_255                 1 1 615 616 -23330=4,1,48,1,1 0=48 1=1 2=55296
 Swish                    Mul_257                  1 1 616 618 -23330=4,1,48,1,1
-Convolution              Conv_258                 1 1 618 620 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
+InnerProduct             Conv_258                 1 1 618 620 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4
 BinaryOp                 Mul_260                  2 1 614_splitncnn_0 620 621 -23330=4,3,7,7,1152 0=2
 Convolution              Conv_261                 1 1 621 623 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
 BinaryOp                 Add_263                  2 1 606_splitncnn_0 623 624 -23330=4,3,7,7,192
@@ -192,7 +192,7 @@ Split                    splitncnn_24             1 2 632 632_splitncnn_0 632_sp
 Pooling                  GlobalAveragePool_272    1 1 632_splitncnn_1 633 -23330=4,1,1152,1,1 0=1 4=1
 InnerProduct             Conv_273                 1 1 633 634 -23330=4,1,48,1,1 0=48 1=1 2=55296
 Swish                    Mul_275                  1 1 634 636 -23330=4,1,48,1,1
-Convolution              Conv_276                 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
+InnerProduct             Conv_276                 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 2=55296 9=4
 BinaryOp                 Mul_278                  2 1 632_splitncnn_0 638 639 -23330=4,3,7,7,1152 0=2
 Convolution              Conv_279                 1 1 639 641 -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640
 Convolution              Conv_281                 1 1 641 643 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600
diff --git a/benchmark/models/googlenet_int8.param b/benchmark/models/googlenet_int8.param
index baf13855d353..5ed1575ac6d6 100644
--- a/benchmark/models/googlenet_int8.param
+++ b/benchmark/models/googlenet_int8.param
@@ -1,96 +1,96 @@
 7767517
 94 121
-Input                    data                     0 1 data 0=224 1=224 2=3
-Convolution              conv1/7x7_s2             1 1 data conv1/7x7_s2_conv1/relu_7x7 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
-Pooling                  pool1/3x3_s2             1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 1=3 2=2
-LRN                      pool1/norm1              1 1 pool1/3x3_s2 pool1/norm1 2=0.000100
-Convolution              conv2/3x3_reduce         1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce 0=64 1=1 5=1 6=4096 8=102 9=1
-Convolution              conv2/3x3                1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 0=192 1=3 4=1 5=1 6=110592 8=2 9=1
-LRN                      conv2/norm2              1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 2=0.000100
-Pooling                  pool2/3x3_s2             1 1 conv2/norm2 pool2/3x3_s2 1=3 2=2
-Split                    splitncnn_0              1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3
-Convolution              inception_3a/1x1         1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 0=64 1=1 5=1 6=12288 8=2 9=1
-Convolution              inception_3a/3x3_reduce  1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce 0=96 1=1 5=1 6=18432 8=102 9=1
-Convolution              inception_3a/3x3         1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 0=128 1=3 4=1 5=1 6=110592 8=2 9=1
-Convolution              inception_3a/5x5_reduce  1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce 0=16 1=1 5=1 6=3072 8=102 9=1
-Convolution              inception_3a/5x5         1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 0=32 1=5 4=2 5=1 6=12800 8=2 9=1
-Pooling                  inception_3a/pool        1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 1=3 3=1
-Convolution              inception_3a/pool_proj   1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj 0=32 1=1 5=1 6=6144 8=2 9=1
-Concat                   inception_3a/output      4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output
-Split                    splitncnn_1              1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3
-Convolution              inception_3b/1x1         1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 0=128 1=1 5=1 6=32768 8=2 9=1
-Convolution              inception_3b/3x3_reduce  1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce 0=128 1=1 5=1 6=32768 8=102 9=1
-Convolution              inception_3b/3x3         1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 0=192 1=3 4=1 5=1 6=221184 8=2 9=1
-Convolution              inception_3b/5x5_reduce  1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce 0=32 1=1 5=1 6=8192 8=102 9=1
-Convolution              inception_3b/5x5         1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 0=96 1=5 4=2 5=1 6=76800 8=2 9=1
-Pooling                  inception_3b/pool        1 1 inception_3a/output_splitncnn_0 inception_3b/pool 1=3 3=1
-Convolution              inception_3b/pool_proj   1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj 0=64 1=1 5=1 6=16384 8=2 9=1
-Concat                   inception_3b/output      4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output
-Pooling                  pool3/3x3_s2             1 1 inception_3b/output pool3/3x3_s2 1=3 2=2
-Split                    splitncnn_2              1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3
-Convolution              inception_4a/1x1         1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 0=192 1=1 5=1 6=92160 8=2 9=1
-Convolution              inception_4a/3x3_reduce  1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce 0=96 1=1 5=1 6=46080 8=102 9=1
-Convolution              inception_4a/3x3         1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 0=208 1=3 4=1 5=1 6=179712 8=2 9=1
-Convolution              inception_4a/5x5_reduce  1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce 0=16 1=1 5=1 6=7680 8=102 9=1
-Convolution              inception_4a/5x5         1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 0=48 1=5 4=2 5=1 6=19200 8=2 9=1
-Pooling                  inception_4a/pool        1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 1=3 3=1
-Convolution              inception_4a/pool_proj   1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj 0=64 1=1 5=1 6=30720 8=2 9=1
-Concat                   inception_4a/output      4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output
-Split                    splitncnn_3              1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3
-Convolution              inception_4b/1x1         1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 0=160 1=1 5=1 6=81920 8=2 9=1
-Convolution              inception_4b/3x3_reduce  1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce 0=112 1=1 5=1 6=57344 8=102 9=1
-Convolution              inception_4b/3x3         1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 0=224 1=3 4=1 5=1 6=225792 8=2 9=1
-Convolution              inception_4b/5x5_reduce  1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1
-Convolution              inception_4b/5x5         1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
-Pooling                  inception_4b/pool        1 1 inception_4a/output_splitncnn_0 inception_4b/pool 1=3 3=1
-Convolution              inception_4b/pool_proj   1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
-Concat                   inception_4b/output      4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output
-Split                    splitncnn_4              1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3
-Convolution              inception_4c/1x1         1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 0=128 1=1 5=1 6=65536 8=2 9=1
-Convolution              inception_4c/3x3_reduce  1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce 0=128 1=1 5=1 6=65536 8=102 9=1
-Convolution              inception_4c/3x3         1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 0=256 1=3 4=1 5=1 6=294912 8=2 9=1
-Convolution              inception_4c/5x5_reduce  1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1
-Convolution              inception_4c/5x5         1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
-Pooling                  inception_4c/pool        1 1 inception_4b/output_splitncnn_0 inception_4c/pool 1=3 3=1
-Convolution              inception_4c/pool_proj   1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
-Concat                   inception_4c/output      4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output
-Split                    splitncnn_5              1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3
-Convolution              inception_4d/1x1         1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 0=112 1=1 5=1 6=57344 8=2 9=1
-Convolution              inception_4d/3x3_reduce  1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce 0=144 1=1 5=1 6=73728 8=102 9=1
-Convolution              inception_4d/3x3         1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 0=288 1=3 4=1 5=1 6=373248 8=2 9=1
-Convolution              inception_4d/5x5_reduce  1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce 0=32 1=1 5=1 6=16384 8=102 9=1
-Convolution              inception_4d/5x5         1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 0=64 1=5 4=2 5=1 6=51200 8=2 9=1
-Pooling                  inception_4d/pool        1 1 inception_4c/output_splitncnn_0 inception_4d/pool 1=3 3=1
-Convolution              inception_4d/pool_proj   1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
-Concat                   inception_4d/output      4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output
-Split                    splitncnn_6              1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3
-Convolution              inception_4e/1x1         1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 0=256 1=1 5=1 6=135168 8=2 9=1
-Convolution              inception_4e/3x3_reduce  1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce 0=160 1=1 5=1 6=84480 8=102 9=1
-Convolution              inception_4e/3x3         1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
-Convolution              inception_4e/5x5_reduce  1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce 0=32 1=1 5=1 6=16896 8=102 9=1
-Convolution              inception_4e/5x5         1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
-Pooling                  inception_4e/pool        1 1 inception_4d/output_splitncnn_0 inception_4e/pool 1=3 3=1
-Convolution              inception_4e/pool_proj   1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj 0=128 1=1 5=1 6=67584 8=2 9=1
-Concat                   inception_4e/output      4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output
-Pooling                  pool4/3x3_s2             1 1 inception_4e/output pool4/3x3_s2 1=3 2=2
-Split                    splitncnn_7              1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3
-Convolution              inception_5a/1x1         1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 0=256 1=1 5=1 6=212992 8=2 9=1
-Convolution              inception_5a/3x3_reduce  1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce 0=160 1=1 5=1 6=133120 8=102 9=1
-Convolution              inception_5a/3x3         1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
-Convolution              inception_5a/5x5_reduce  1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce 0=32 1=1 5=1 6=26624 8=102 9=1
-Convolution              inception_5a/5x5         1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
-Pooling                  inception_5a/pool        1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 1=3 3=1
-Convolution              inception_5a/pool_proj   1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1
-Concat                   inception_5a/output      4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output
-Split                    splitncnn_8              1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3
-Convolution              inception_5b/1x1         1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 0=384 1=1 5=1 6=319488 8=2 9=1
-Convolution              inception_5b/3x3_reduce  1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce 0=192 1=1 5=1 6=159744 8=102 9=1
-Convolution              inception_5b/3x3         1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 0=384 1=3 4=1 5=1 6=663552 8=2 9=1
-Convolution              inception_5b/5x5_reduce  1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce 0=48 1=1 5=1 6=39936 8=102 9=1
-Convolution              inception_5b/5x5         1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 0=128 1=5 4=2 5=1 6=153600 8=2 9=1
-Pooling                  inception_5b/pool        1 1 inception_5a/output_splitncnn_0 inception_5b/pool 1=3 3=1
-Convolution              inception_5b/pool_proj   1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1
-Concat                   inception_5b/output      4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output
-Pooling                  pool5/7x7_s1             1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 0=1 1=7
-InnerProduct             loss3/classifier         1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000
-Softmax                  prob                     1 1 loss3/classifier output
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              conv1/7x7_s2             1 1 data conv1/7x7_s2_conv1/relu_7x7 -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
+Pooling                  pool1/3x3_s2             1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 -23330=4,3,56,56,64 1=3 2=2
+LRN                      pool1/norm1              1 1 pool1/3x3_s2 pool1/norm1 -23330=4,3,56,56,64 2=0.000100
+Convolution              conv2/3x3_reduce         1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=102 9=1
+Convolution              conv2/3x3                1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 -23330=4,3,56,56,192 0=192 1=3 4=1 5=1 6=110592 8=2 9=1
+LRN                      conv2/norm2              1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 -23330=4,3,56,56,192 2=0.000100
+Pooling                  pool2/3x3_s2             1 1 conv2/norm2 pool2/3x3_s2 -23330=4,3,28,28,192 1=3 2=2
+Split                    splitncnn_0              1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 -23330=16,3,28,28,192,3,28,28,192,3,28,28,192,3,28,28,192
+Convolution              inception_3a/1x1         1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 -23330=4,3,28,28,64 0=64 1=1 5=1 6=12288 8=2 9=1
+Convolution              inception_3a/3x3_reduce  1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce -23330=4,3,28,28,96 0=96 1=1 5=1 6=18432 8=102 9=1
+Convolution              inception_3a/3x3         1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=110592 8=2 9=1
+Convolution              inception_3a/5x5_reduce  1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce -23330=4,3,28,28,16 0=16 1=1 5=1 6=3072 8=102 9=1
+Convolution              inception_3a/5x5         1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 -23330=4,3,28,28,32 0=32 1=5 4=2 5=1 6=12800 8=2 9=1
+Pooling                  inception_3a/pool        1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool -23330=4,3,28,28,192 1=3 3=1
+Convolution              inception_3a/pool_proj   1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 8=2 9=1
+Concat                   inception_3a/output      4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output -23330=4,3,28,28,256
+Split                    splitncnn_1              1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 -23330=16,3,28,28,256,3,28,28,256,3,28,28,256,3,28,28,256
+Convolution              inception_3b/1x1         1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 8=2 9=1
+Convolution              inception_3b/3x3_reduce  1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 8=102 9=1
+Convolution              inception_3b/3x3         1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=221184 8=2 9=1
+Convolution              inception_3b/5x5_reduce  1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 8=102 9=1
+Convolution              inception_3b/5x5         1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 -23330=4,3,28,28,96 0=96 1=5 4=2 5=1 6=76800 8=2 9=1
+Pooling                  inception_3b/pool        1 1 inception_3a/output_splitncnn_0 inception_3b/pool -23330=4,3,28,28,256 1=3 3=1
+Convolution              inception_3b/pool_proj   1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj -23330=4,3,28,28,64 0=64 1=1 5=1 6=16384 8=2 9=1
+Concat                   inception_3b/output      4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output -23330=4,3,28,28,480
+Pooling                  pool3/3x3_s2             1 1 inception_3b/output pool3/3x3_s2 -23330=4,3,14,14,480 1=3 2=2
+Split                    splitncnn_2              1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 -23330=16,3,14,14,480,3,14,14,480,3,14,14,480,3,14,14,480
+Convolution              inception_4a/1x1         1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=92160 8=2 9=1
+Convolution              inception_4a/3x3_reduce  1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 8=102 9=1
+Convolution              inception_4a/3x3         1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=179712 8=2 9=1
+Convolution              inception_4a/5x5_reduce  1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce -23330=4,3,14,14,16 0=16 1=1 5=1 6=7680 8=102 9=1
+Convolution              inception_4a/5x5         1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 -23330=4,3,14,14,48 0=48 1=5 4=2 5=1 6=19200 8=2 9=1
+Pooling                  inception_4a/pool        1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool -23330=4,3,14,14,480 1=3 3=1
+Convolution              inception_4a/pool_proj   1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=30720 8=2 9=1
+Concat                   inception_4a/output      4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output -23330=4,3,14,14,512
+Split                    splitncnn_3              1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
+Convolution              inception_4b/1x1         1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 -23330=4,3,14,14,160 0=160 1=1 5=1 6=81920 8=2 9=1
+Convolution              inception_4b/3x3_reduce  1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 8=102 9=1
+Convolution              inception_4b/3x3         1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 -23330=4,3,14,14,224 0=224 1=3 4=1 5=1 6=225792 8=2 9=1
+Convolution              inception_4b/5x5_reduce  1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 8=102 9=1
+Convolution              inception_4b/5x5         1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
+Pooling                  inception_4b/pool        1 1 inception_4a/output_splitncnn_0 inception_4b/pool -23330=4,3,14,14,512 1=3 3=1
+Convolution              inception_4b/pool_proj   1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1
+Concat                   inception_4b/output      4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output -23330=4,3,14,14,512
+Split                    splitncnn_4              1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
+Convolution              inception_4c/1x1         1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 8=2 9=1
+Convolution              inception_4c/3x3_reduce  1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 8=102 9=1
+Convolution              inception_4c/3x3         1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=294912 8=2 9=1
+Convolution              inception_4c/5x5_reduce  1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 8=102 9=1
+Convolution              inception_4c/5x5         1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
+Pooling                  inception_4c/pool        1 1 inception_4b/output_splitncnn_0 inception_4c/pool -23330=4,3,14,14,512 1=3 3=1
+Convolution              inception_4c/pool_proj   1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1
+Concat                   inception_4c/output      4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output -23330=4,3,14,14,512
+Split                    splitncnn_5              1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
+Convolution              inception_4d/1x1         1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 8=2 9=1
+Convolution              inception_4d/3x3_reduce  1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce -23330=4,3,14,14,144 0=144 1=1 5=1 6=73728 8=102 9=1
+Convolution              inception_4d/3x3         1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 -23330=4,3,14,14,288 0=288 1=3 4=1 5=1 6=373248 8=2 9=1
+Convolution              inception_4d/5x5_reduce  1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16384 8=102 9=1
+Convolution              inception_4d/5x5         1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=51200 8=2 9=1
+Pooling                  inception_4d/pool        1 1 inception_4c/output_splitncnn_0 inception_4d/pool -23330=4,3,14,14,512 1=3 3=1
+Convolution              inception_4d/pool_proj   1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=2 9=1
+Concat                   inception_4d/output      4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output -23330=4,3,14,14,528
+Split                    splitncnn_6              1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 -23330=16,3,14,14,528,3,14,14,528,3,14,14,528,3,14,14,528
+Convolution              inception_4e/1x1         1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=135168 8=2 9=1
+Convolution              inception_4e/3x3_reduce  1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce -23330=4,3,14,14,160 0=160 1=1 5=1 6=84480 8=102 9=1
+Convolution              inception_4e/3x3         1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 -23330=4,3,14,14,320 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
+Convolution              inception_4e/5x5_reduce  1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16896 8=102 9=1
+Convolution              inception_4e/5x5         1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 -23330=4,3,14,14,128 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
+Pooling                  inception_4e/pool        1 1 inception_4d/output_splitncnn_0 inception_4e/pool -23330=4,3,14,14,528 1=3 3=1
+Convolution              inception_4e/pool_proj   1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj -23330=4,3,14,14,128 0=128 1=1 5=1 6=67584 8=2 9=1
+Concat                   inception_4e/output      4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output -23330=4,3,14,14,832
+Pooling                  pool4/3x3_s2             1 1 inception_4e/output pool4/3x3_s2 -23330=4,3,7,7,832 1=3 2=2
+Split                    splitncnn_7              1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832
+Convolution              inception_5a/1x1         1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 -23330=4,3,7,7,256 0=256 1=1 5=1 6=212992 8=2 9=1
+Convolution              inception_5a/3x3_reduce  1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce -23330=4,3,7,7,160 0=160 1=1 5=1 6=133120 8=102 9=1
+Convolution              inception_5a/3x3         1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 -23330=4,3,7,7,320 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
+Convolution              inception_5a/5x5_reduce  1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce -23330=4,3,7,7,32 0=32 1=1 5=1 6=26624 8=102 9=1
+Convolution              inception_5a/5x5         1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
+Pooling                  inception_5a/pool        1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool -23330=4,3,7,7,832 1=3 3=1
+Convolution              inception_5a/pool_proj   1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 8=2 9=1
+Concat                   inception_5a/output      4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output -23330=4,3,7,7,832
+Split                    splitncnn_8              1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832
+Convolution              inception_5b/1x1         1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 -23330=4,3,7,7,384 0=384 1=1 5=1 6=319488 8=2 9=1
+Convolution              inception_5b/3x3_reduce  1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce -23330=4,3,7,7,192 0=192 1=1 5=1 6=159744 8=102 9=1
+Convolution              inception_5b/3x3         1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 -23330=4,3,7,7,384 0=384 1=3 4=1 5=1 6=663552 8=2 9=1
+Convolution              inception_5b/5x5_reduce  1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce -23330=4,3,7,7,48 0=48 1=1 5=1 6=39936 8=102 9=1
+Convolution              inception_5b/5x5         1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=153600 8=2 9=1
+Pooling                  inception_5b/pool        1 1 inception_5a/output_splitncnn_0 inception_5b/pool -23330=4,3,7,7,832 1=3 3=1
+Convolution              inception_5b/pool_proj   1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 8=2 9=1
+Concat                   inception_5b/output      4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output -23330=4,3,7,7,1024
+Pooling                  pool5/7x7_s1             1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 -23330=4,3,1,1,1024 0=1 1=7
+InnerProduct             loss3/classifier         1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier -23330=4,1,1000,1,1 0=1000 1=1 2=1024000
+Softmax                  prob                     1 1 loss3/classifier output -23330=4,1,1000,1,1
diff --git a/benchmark/models/mobilenet_int8.param b/benchmark/models/mobilenet_int8.param
index 12c63e0f5e30..f5a078b2d2d6 100644
--- a/benchmark/models/mobilenet_int8.param
+++ b/benchmark/models/mobilenet_int8.param
@@ -1,33 +1,33 @@
 7767517
 31 31
-Input                    data                     0 1 data 0=224 1=224 2=3
-Convolution              conv1                    1 1 data conv1_relu1 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
-ConvolutionDepthWise     conv2_1/dw               1 1 conv1_relu1 conv2_1/dw_relu2_1/dw 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
-Convolution              conv2_1/sep              1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep 0=64 1=1 5=1 6=2048 8=102 9=1
-ConvolutionDepthWise     conv2_2/dw               1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
-Convolution              conv2_2/sep              1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep 0=128 1=1 5=1 6=8192 8=102 9=1
-ConvolutionDepthWise     conv3_1/dw               1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
-Convolution              conv3_1/sep              1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep 0=128 1=1 5=1 6=16384 8=102 9=1
-ConvolutionDepthWise     conv3_2/dw               1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
-Convolution              conv3_2/sep              1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep 0=256 1=1 5=1 6=32768 8=102 9=1
-ConvolutionDepthWise     conv4_1/dw               1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
-Convolution              conv4_1/sep              1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep 0=256 1=1 5=1 6=65536 8=102 9=1
-ConvolutionDepthWise     conv4_2/dw               1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
-Convolution              conv4_2/sep              1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep 0=512 1=1 5=1 6=131072 8=102 9=1
-ConvolutionDepthWise     conv5_1/dw               1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_1/sep              1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv5_2/dw               1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_2/sep              1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv5_3/dw               1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_3/sep              1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv5_4/dw               1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_4/sep              1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv5_5/dw               1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_5/sep              1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv5_6/dw               1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv5_6/sep              1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep 0=1024 1=1 5=1 6=524288 8=102 9=1
-ConvolutionDepthWise     conv6/dw                 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
-Convolution              conv6/sep                1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep 0=1024 1=1 5=1 6=1048576 8=2 9=1
-Pooling                  pool6                    1 1 conv6/sep_relu6/sep pool6 0=1 4=1
-InnerProduct             fc7                      1 1 pool6 fc7 0=1000 1=1 2=1024000 8=2
-Softmax                  prob                     1 1 fc7 output
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              conv1                    1 1 data conv1_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
+ConvolutionDepthWise     conv2_1/dw               1 1 conv1_relu1 conv2_1/dw_relu2_1/dw -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
+Convolution              conv2_1/sep              1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep -23330=4,3,112,112,64 0=64 1=1 5=1 6=2048 8=102 9=1
+ConvolutionDepthWise     conv2_2/dw               1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
+Convolution              conv2_2/sep              1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=8192 8=102 9=1
+ConvolutionDepthWise     conv3_1/dw               1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw -23330=4,3,56,56,128 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
+Convolution              conv3_1/sep              1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=16384 8=102 9=1
+ConvolutionDepthWise     conv3_2/dw               1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
+Convolution              conv3_2/sep              1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=32768 8=102 9=1
+ConvolutionDepthWise     conv4_1/dw               1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw -23330=4,3,28,28,256 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
+Convolution              conv4_1/sep              1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=65536 8=102 9=1
+ConvolutionDepthWise     conv4_2/dw               1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
+Convolution              conv4_2/sep              1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=131072 8=102 9=1
+ConvolutionDepthWise     conv5_1/dw               1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_1/sep              1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv5_2/dw               1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_2/sep              1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv5_3/dw               1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_3/sep              1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv5_4/dw               1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_4/sep              1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv5_5/dw               1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_5/sep              1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv5_6/dw               1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv5_6/sep              1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=524288 8=102 9=1
+ConvolutionDepthWise     conv6/dw                 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw -23330=4,3,7,7,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
+Convolution              conv6/sep                1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=1048576 8=2 9=1
+Pooling                  pool6                    1 1 conv6/sep_relu6/sep pool6 -23330=4,1,1024,1,1 0=1 4=1
+InnerProduct             fc7                      1 1 pool6 fc7 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000 8=2
+Softmax                  prob                     1 1 fc7 output -23330=4,1,1000,1,1
diff --git a/benchmark/models/mobilenet_ssd_int8.param b/benchmark/models/mobilenet_ssd_int8.param
index 3b38cf04ad70..90460ef8decf 100644
--- a/benchmark/models/mobilenet_ssd_int8.param
+++ b/benchmark/models/mobilenet_ssd_int8.param
@@ -1,94 +1,94 @@
 7767517
 92 115
-Input                    input                    0 1 data 0=300 1=300 2=3
-Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
-Convolution              conv0                    1 1 data_splitncnn_6 conv0_conv0/relu 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
-ConvolutionDepthWise     conv1/dw                 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
-Convolution              conv1                    1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu 0=64 1=1 5=1 6=2048 8=102 9=1
-ConvolutionDepthWise     conv2/dw                 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
-Convolution              conv2                    1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu 0=128 1=1 5=1 6=8192 8=102 9=1
-ConvolutionDepthWise     conv3/dw                 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
-Convolution              conv3                    1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu 0=128 1=1 5=1 6=16384 8=102 9=1
-ConvolutionDepthWise     conv4/dw                 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
-Convolution              conv4                    1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu 0=256 1=1 5=1 6=32768 8=102 9=1
-ConvolutionDepthWise     conv5/dw                 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
-Convolution              conv5                    1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu 0=256 1=1 5=1 6=65536 8=102 9=1
-ConvolutionDepthWise     conv6/dw                 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
-Convolution              conv6                    1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu 0=512 1=1 5=1 6=131072 8=102 9=1
-ConvolutionDepthWise     conv7/dw                 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv7                    1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv8/dw                 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv8                    1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv9/dw                 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv9                    1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv10/dw                1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv10                   1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu 0=512 1=1 5=1 6=262144 8=102 9=1
-ConvolutionDepthWise     conv11/dw                1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv11                   1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu 0=512 1=1 5=1 6=262144 8=2 9=1
-Split                    splitncnn_1              1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3
-ConvolutionDepthWise     conv12/dw                1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
-Convolution              conv12                   1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu 0=1024 1=1 5=1 6=524288 8=102 9=1
-ConvolutionDepthWise     conv13/dw                1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
-Convolution              conv13                   1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu 0=1024 1=1 5=1 6=1048576 8=2 9=1
-Split                    splitncnn_2              1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3
-Convolution              conv14_1                 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              conv14_2                 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1
-Split                    splitncnn_3              1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3
-Convolution              conv15_1                 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu 0=128 1=1 5=1 6=65536 8=102 9=1
-Convolution              conv15_2                 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
-Split                    splitncnn_4              1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3
-Convolution              conv16_1                 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu 0=128 1=1 5=1 6=32768 8=102 9=1
-Convolution              conv16_2                 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
-Split                    splitncnn_5              1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3
-Convolution              conv17_1                 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1
-Convolution              conv17_2                 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
-Split                    splitncnn_6              1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2
-Convolution              conv11_mbox_loc          1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 5=1 6=6144 8=2
-Permute                  conv11_mbox_loc_perm     1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3
-Flatten                  conv11_mbox_loc_flat     1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat
-Convolution              conv11_mbox_conf         1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 5=1 6=32256 8=2
-Permute                  conv11_mbox_conf_perm    1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3
-Flatten                  conv11_mbox_conf_flat    1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat
-PriorBox                 conv11_mbox_priorbox     2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000
-Convolution              conv13_mbox_loc          1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 5=1 6=24576 8=2
-Permute                  conv13_mbox_loc_perm     1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3
-Flatten                  conv13_mbox_loc_flat     1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat
-Convolution              conv13_mbox_conf         1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 5=1 6=129024 8=2
-Permute                  conv13_mbox_conf_perm    1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3
-Flatten                  conv13_mbox_conf_flat    1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat
-PriorBox                 conv13_mbox_priorbox     2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
-Convolution              conv14_2_mbox_loc        1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 5=1 6=12288 8=2
-Permute                  conv14_2_mbox_loc_perm   1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3
-Flatten                  conv14_2_mbox_loc_flat   1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat
-Convolution              conv14_2_mbox_conf       1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 5=1 6=64512 8=2
-Permute                  conv14_2_mbox_conf_perm  1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3
-Flatten                  conv14_2_mbox_conf_flat  1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat
-PriorBox                 conv14_2_mbox_priorbox   2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
-Convolution              conv15_2_mbox_loc        1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2
-Permute                  conv15_2_mbox_loc_perm   1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3
-Flatten                  conv15_2_mbox_loc_flat   1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat
-Convolution              conv15_2_mbox_conf       1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2
-Permute                  conv15_2_mbox_conf_perm  1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3
-Flatten                  conv15_2_mbox_conf_flat  1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat
-PriorBox                 conv15_2_mbox_priorbox   2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
-Convolution              conv16_2_mbox_loc        1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2
-Permute                  conv16_2_mbox_loc_perm   1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3
-Flatten                  conv16_2_mbox_loc_flat   1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat
-Convolution              conv16_2_mbox_conf       1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2
-Permute                  conv16_2_mbox_conf_perm  1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3
-Flatten                  conv16_2_mbox_conf_flat  1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat
-PriorBox                 conv16_2_mbox_priorbox   2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
-Convolution              conv17_2_mbox_loc        1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 5=1 6=3072 8=2
-Permute                  conv17_2_mbox_loc_perm   1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3
-Flatten                  conv17_2_mbox_loc_flat   1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat
-Convolution              conv17_2_mbox_conf       1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 5=1 6=16128 8=2
-Permute                  conv17_2_mbox_conf_perm  1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3
-Flatten                  conv17_2_mbox_conf_flat  1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat
-PriorBox                 conv17_2_mbox_priorbox   2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
-Concat                   mbox_loc                 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc
-Concat                   mbox_conf                6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf
-Concat                   mbox_priorbox            6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1
-Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape 0=21 1=-1
-Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1
-Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten
+Input                    input                    0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3
+Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3
+Convolution              conv0                    1 1 data_splitncnn_6 conv0_conv0/relu -23330=4,3,150,150,32 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
+ConvolutionDepthWise     conv1/dw                 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,150,150,32 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
+Convolution              conv1                    1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,150,150,64 0=64 1=1 5=1 6=2048 8=102 9=1
+ConvolutionDepthWise     conv2/dw                 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,75,75,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
+Convolution              conv2                    1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=8192 8=102 9=1
+ConvolutionDepthWise     conv3/dw                 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,75,75,128 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
+Convolution              conv3                    1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=16384 8=102 9=1
+ConvolutionDepthWise     conv4/dw                 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,38,38,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
+Convolution              conv4                    1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=32768 8=102 9=1
+ConvolutionDepthWise     conv5/dw                 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,38,38,256 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
+Convolution              conv5                    1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=65536 8=102 9=1
+ConvolutionDepthWise     conv6/dw                 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,19,19,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
+Convolution              conv6                    1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=131072 8=102 9=1
+ConvolutionDepthWise     conv7/dw                 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv7                    1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv8/dw                 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv8                    1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv9/dw                 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv9                    1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv10/dw                1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv10                   1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=102 9=1
+ConvolutionDepthWise     conv11/dw                1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv11                   1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 8=2 9=1
+Split                    splitncnn_1              1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 -23330=16,3,19,19,512,3,19,19,512,3,19,19,512,3,19,19,512
+ConvolutionDepthWise     conv12/dw                1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu -23330=4,3,10,10,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
+Convolution              conv12                   1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=524288 8=102 9=1
+ConvolutionDepthWise     conv13/dw                1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,10,10,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
+Convolution              conv13                   1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=1048576 8=2 9=1
+Split                    splitncnn_2              1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 -23330=16,3,10,10,1024,3,10,10,1024,3,10,10,1024,3,10,10,1024
+Convolution              conv14_1                 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu -23330=4,3,10,10,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              conv14_2                 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu -23330=4,3,5,5,512 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1
+Split                    splitncnn_3              1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 -23330=16,3,5,5,512,3,5,5,512,3,5,5,512,3,5,5,512
+Convolution              conv15_1                 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu -23330=4,3,5,5,128 0=128 1=1 5=1 6=65536 8=102 9=1
+Convolution              conv15_2                 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu -23330=4,3,3,3,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
+Split                    splitncnn_4              1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 -23330=16,3,3,3,256,3,3,3,256,3,3,3,256,3,3,3,256
+Convolution              conv16_1                 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu -23330=4,3,3,3,128 0=128 1=1 5=1 6=32768 8=102 9=1
+Convolution              conv16_2                 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
+Split                    splitncnn_5              1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256
+Convolution              conv17_1                 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 8=102 9=1
+Convolution              conv17_2                 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
+Split                    splitncnn_6              1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128
+Convolution              conv11_mbox_loc          1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc -23330=4,3,19,19,12 0=12 1=1 5=1 6=6144 8=2
+Permute                  conv11_mbox_loc_perm     1 1 conv11_mbox_loc conv11_mbox_loc_perm -23330=4,3,12,19,19 0=3
+Flatten                  conv11_mbox_loc_flat     1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat -23330=4,1,4332,1,1
+Convolution              conv11_mbox_conf         1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf -23330=4,3,19,19,63 0=63 1=1 5=1 6=32256 8=2
+Permute                  conv11_mbox_conf_perm    1 1 conv11_mbox_conf conv11_mbox_conf_perm -23330=4,3,63,19,19 0=3
+Flatten                  conv11_mbox_conf_flat    1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat -23330=4,1,22743,1,1
+PriorBox                 conv11_mbox_priorbox     2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23330=4,2,4332,2,1 -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000
+Convolution              conv13_mbox_loc          1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc -23330=4,3,10,10,24 0=24 1=1 5=1 6=24576 8=2
+Permute                  conv13_mbox_loc_perm     1 1 conv13_mbox_loc conv13_mbox_loc_perm -23330=4,3,24,10,10 0=3
+Flatten                  conv13_mbox_loc_flat     1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat -23330=4,1,2400,1,1
+Convolution              conv13_mbox_conf         1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf -23330=4,3,10,10,126 0=126 1=1 5=1 6=129024 8=2
+Permute                  conv13_mbox_conf_perm    1 1 conv13_mbox_conf conv13_mbox_conf_perm -23330=4,3,126,10,10 0=3
+Flatten                  conv13_mbox_conf_flat    1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat -23330=4,1,12600,1,1
+PriorBox                 conv13_mbox_priorbox     2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23330=4,2,2400,2,1 -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
+Convolution              conv14_2_mbox_loc        1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc -23330=4,3,5,5,24 0=24 1=1 5=1 6=12288 8=2
+Permute                  conv14_2_mbox_loc_perm   1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm -23330=4,3,24,5,5 0=3
+Flatten                  conv14_2_mbox_loc_flat   1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat -23330=4,1,600,1,1
+Convolution              conv14_2_mbox_conf       1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf -23330=4,3,5,5,126 0=126 1=1 5=1 6=64512 8=2
+Permute                  conv14_2_mbox_conf_perm  1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm -23330=4,3,126,5,5 0=3
+Flatten                  conv14_2_mbox_conf_flat  1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat -23330=4,1,3150,1,1
+PriorBox                 conv14_2_mbox_priorbox   2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23330=4,2,600,2,1 -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
+Convolution              conv15_2_mbox_loc        1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc -23330=4,3,3,3,24 0=24 1=1 5=1 6=6144 8=2
+Permute                  conv15_2_mbox_loc_perm   1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm -23330=4,3,24,3,3 0=3
+Flatten                  conv15_2_mbox_loc_flat   1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat -23330=4,1,216,1,1
+Convolution              conv15_2_mbox_conf       1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf -23330=4,3,3,3,126 0=126 1=1 5=1 6=32256 8=2
+Permute                  conv15_2_mbox_conf_perm  1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm -23330=4,3,126,3,3 0=3
+Flatten                  conv15_2_mbox_conf_flat  1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat -23330=4,1,1134,1,1
+PriorBox                 conv15_2_mbox_priorbox   2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23330=4,2,216,2,1 -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
+Convolution              conv16_2_mbox_loc        1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc -23330=4,3,2,2,24 0=24 1=1 5=1 6=6144 8=2
+Permute                  conv16_2_mbox_loc_perm   1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm -23330=4,3,24,2,2 0=3
+Flatten                  conv16_2_mbox_loc_flat   1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat -23330=4,1,96,1,1
+Convolution              conv16_2_mbox_conf       1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf -23330=4,3,2,2,126 0=126 1=1 5=1 6=32256 8=2
+Permute                  conv16_2_mbox_conf_perm  1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm -23330=4,3,126,2,2 0=3
+Flatten                  conv16_2_mbox_conf_flat  1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat -23330=4,1,504,1,1
+PriorBox                 conv16_2_mbox_priorbox   2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
+Convolution              conv17_2_mbox_loc        1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc -23330=4,3,1,1,24 0=24 1=1 5=1 6=3072 8=2
+Permute                  conv17_2_mbox_loc_perm   1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm -23330=4,3,24,1,1 0=3
+Flatten                  conv17_2_mbox_loc_flat   1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat -23330=4,1,24,1,1
+Convolution              conv17_2_mbox_conf       1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf -23330=4,3,1,1,126 0=126 1=1 5=1 6=16128 8=2
+Permute                  conv17_2_mbox_conf_perm  1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm -23330=4,3,126,1,1 0=3
+Flatten                  conv17_2_mbox_conf_flat  1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat -23330=4,1,126,1,1
+PriorBox                 conv17_2_mbox_priorbox   2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23330=4,2,24,2,1 -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
+Concat                   mbox_loc                 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc -23330=4,1,7668,1,1
+Concat                   mbox_conf                6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf -23330=4,1,40257,1,1
+Concat                   mbox_priorbox            6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox -23330=4,2,7668,2,1 0=1
+Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,1917,1 0=21 1=-1
+Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,1917,1 0=1 1=1
+Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,40257,1,1
 DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000
diff --git a/benchmark/models/resnet18_int8.param b/benchmark/models/resnet18_int8.param
index a546b5fdf2fe..ed42e90e9d33 100644
--- a/benchmark/models/resnet18_int8.param
+++ b/benchmark/models/resnet18_int8.param
@@ -1,52 +1,52 @@
 7767517
 50 58
-Input                    data                     0 1 data 0=224 1=224 2=3
-Convolution              conv1                    1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
-Pooling                  pool1                    1 1 conv1_conv1_relu pool1 1=3 2=2
-Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
-Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=64 1=1 5=1 6=4096 8=2
-Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
-Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2
-Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1
-ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu
-Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
-Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
-Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2
-Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1
-ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu
-Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
-Convolution              res3a_branch1            1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=128 1=1 3=2 5=1 6=8192 8=2
-Convolution              res3a_branch2a           1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1
-Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2
-Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1
-ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu
-Split                    splitncnn_3              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
-Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
-Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2
-Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1
-ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu
-Split                    splitncnn_4              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
-Convolution              res4a_branch1            1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=256 1=1 3=2 5=1 6=32768 8=2
-Convolution              res4a_branch2a           1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1
-Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2
-Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1
-ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu
-Split                    splitncnn_5              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
-Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2
-Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1
-ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu
-Split                    splitncnn_6              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
-Convolution              res5a_branch1            1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2
-Convolution              res5a_branch2a           1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1
-Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2
-Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1
-ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu
-Split                    splitncnn_7              1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
-Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2
-Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1
-ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu
-Pooling                  pool5                    1 1 res5b_res5b_relu pool5 0=1 1=7
-InnerProduct             fc1000                   1 1 pool5 fc1000 0=1000 1=1 2=512000
-Softmax                  prob                     1 1 fc1000 output
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
+Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2
+Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
+Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=2
+Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
+Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=2
+Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a -23330=4,3,56,56,64 0=1
+ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu -23330=4,3,56,56,64
+Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
+Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
+Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=2
+Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b -23330=4,3,56,56,64 0=1
+ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu -23330=4,3,56,56,64
+Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
+Convolution              res3a_branch1            1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=8192 8=2
+Convolution              res3a_branch2a           1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1
+Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=2
+Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a -23330=4,3,28,28,128 0=1
+ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu -23330=4,3,28,28,128
+Split                    splitncnn_3              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128
+Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
+Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=2
+Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b -23330=4,3,28,28,128 0=1
+ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu -23330=4,3,28,28,128
+Split                    splitncnn_4              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128
+Convolution              res4a_branch1            1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=32768 8=2
+Convolution              res4a_branch2a           1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1
+Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=2
+Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a -23330=4,3,14,14,256 0=1
+ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu -23330=4,3,14,14,256
+Split                    splitncnn_5              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256
+Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=2
+Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b -23330=4,3,14,14,256 0=1
+ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu -23330=4,3,14,14,256
+Split                    splitncnn_6              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256
+Convolution              res5a_branch1            1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=131072 8=2
+Convolution              res5a_branch2a           1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1
+Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=2
+Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a -23330=4,3,7,7,512 0=1
+ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu -23330=4,3,7,7,512
+Split                    splitncnn_7              1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,512,3,7,7,512
+Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=2
+Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b -23330=4,3,7,7,512 0=1
+ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu -23330=4,3,7,7,512
+Pooling                  pool5                    1 1 res5b_res5b_relu pool5 -23330=4,3,1,1,512 0=1 1=7
+InnerProduct             fc1000                   1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=512000
+Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1
diff --git a/benchmark/models/resnet50_int8.param b/benchmark/models/resnet50_int8.param
index 48dfbf4bbd82..9970c12df020 100644
--- a/benchmark/models/resnet50_int8.param
+++ b/benchmark/models/resnet50_int8.param
@@ -1,108 +1,108 @@
 7767517
 106 122
-Input                    data                     0 1 data 0=224 1=224 2=3
-Convolution              conv1                    1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
-Pooling                  pool1                    1 1 conv1_conv1_relu pool1 1=3 2=2
-Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
-Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=256 1=1 5=1 6=16384 8=2
-Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=1 5=1 6=4096 8=102 9=1
-Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
-Convolution              res2a_branch2c           1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c 0=256 1=1 5=1 6=16384 8=2
-Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1
-ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu
-Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
-Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1
-Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
-Convolution              res2b_branch2c           1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c 0=256 1=1 5=1 6=16384 8=2
-Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1
-ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu
-Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
-Convolution              res2c_branch2a           1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1
-Convolution              res2c_branch2b           1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
-Convolution              res2c_branch2c           1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c 0=256 1=1 5=1 6=16384 8=2
-Eltwise                  res2c                    2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1
-ReLU                     res2c_relu               1 1 res2c res2c_res2c_relu
-Split                    splitncnn_3              1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
-Convolution              res3a_branch1            1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2
-Convolution              res3a_branch2a           1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=1 3=2 5=1 6=32768 8=102 9=1
-Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
-Convolution              res3a_branch2c           1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c 0=512 1=1 5=1 6=65536 8=2
-Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1
-ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu
-Split                    splitncnn_4              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
-Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
-Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
-Convolution              res3b_branch2c           1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c 0=512 1=1 5=1 6=65536 8=2
-Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1
-ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu
-Split                    splitncnn_5              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
-Convolution              res3c_branch2a           1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
-Convolution              res3c_branch2b           1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
-Convolution              res3c_branch2c           1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c 0=512 1=1 5=1 6=65536 8=2
-Eltwise                  res3c                    2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1
-ReLU                     res3c_relu               1 1 res3c res3c_res3c_relu
-Split                    splitncnn_6              1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
-Convolution              res3d_branch2a           1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
-Convolution              res3d_branch2b           1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
-Convolution              res3d_branch2c           1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c 0=512 1=1 5=1 6=65536 8=2
-Eltwise                  res3d                    2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1
-ReLU                     res3d_relu               1 1 res3d res3d_res3d_relu
-Split                    splitncnn_7              1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
-Convolution              res4a_branch1            1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=1024 1=1 3=2 5=1 6=524288 8=2
-Convolution              res4a_branch2a           1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=1 3=2 5=1 6=131072 8=102 9=1
-Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4a_branch2c           1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1
-ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu
-Split                    splitncnn_8              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
-Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4b_branch2c           1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1
-ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu
-Split                    splitncnn_9              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
-Convolution              res4c_branch2a           1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              res4c_branch2b           1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4c_branch2c           1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4c                    2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1
-ReLU                     res4c_relu               1 1 res4c res4c_res4c_relu
-Split                    splitncnn_10             1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
-Convolution              res4d_branch2a           1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              res4d_branch2b           1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4d_branch2c           1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4d                    2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1
-ReLU                     res4d_relu               1 1 res4d res4d_res4d_relu
-Split                    splitncnn_11             1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
-Convolution              res4e_branch2a           1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              res4e_branch2b           1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4e_branch2c           1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4e                    2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1
-ReLU                     res4e_relu               1 1 res4e res4e_res4e_relu
-Split                    splitncnn_12             1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
-Convolution              res4f_branch2a           1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
-Convolution              res4f_branch2b           1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              res4f_branch2c           1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c 0=1024 1=1 5=1 6=262144 8=2
-Eltwise                  res4f                    2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1
-ReLU                     res4f_relu               1 1 res4f res4f_res4f_relu
-Split                    splitncnn_13             1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
-Convolution              res5a_branch1            1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=2048 1=1 3=2 5=1 6=2097152 8=2
-Convolution              res5a_branch2a           1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=1 3=2 5=1 6=524288 8=102 9=1
-Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              res5a_branch2c           1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c 0=2048 1=1 5=1 6=1048576 8=2
-Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1
-ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu
-Split                    splitncnn_14             1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
-Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1
-Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              res5b_branch2c           1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c 0=2048 1=1 5=1 6=1048576 8=2
-Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1
-ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu
-Split                    splitncnn_15             1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
-Convolution              res5c_branch2a           1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1
-Convolution              res5c_branch2b           1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              res5c_branch2c           1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c 0=2048 1=1 5=1 6=1048576 8=2
-Eltwise                  res5c                    2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1
-ReLU                     res5c_relu               1 1 res5c res5c_res5c_relu
-Pooling                  pool5                    1 1 res5c_res5c_relu pool5 0=1 1=7
-InnerProduct             fc1000                   1 1 pool5 fc1000 0=1000 1=1 2=2048000
-Softmax                  prob                     1 1 fc1000 output
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
+Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2
+Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
+Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2
+Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 8=102 9=1
+Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
+Convolution              res2a_branch2c           1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2
+Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a -23330=4,3,56,56,256 0=1
+ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu -23330=4,3,56,56,256
+Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
+Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 8=102 9=1
+Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
+Convolution              res2b_branch2c           1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2
+Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b -23330=4,3,56,56,256 0=1
+ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu -23330=4,3,56,56,256
+Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
+Convolution              res2c_branch2a           1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 8=102 9=1
+Convolution              res2c_branch2b           1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
+Convolution              res2c_branch2c           1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384 8=2
+Eltwise                  res2c                    2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c -23330=4,3,56,56,256 0=1
+ReLU                     res2c_relu               1 1 res2c res2c_res2c_relu -23330=4,3,56,56,256
+Split                    splitncnn_3              1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
+Convolution              res3a_branch1            1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,512 0=512 1=1 3=2 5=1 6=131072 8=2
+Convolution              res3a_branch2a           1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=32768 8=102 9=1
+Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
+Convolution              res3a_branch2c           1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2
+Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a -23330=4,3,28,28,512 0=1
+ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu -23330=4,3,28,28,512
+Split                    splitncnn_4              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
+Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1
+Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
+Convolution              res3b_branch2c           1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2
+Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b -23330=4,3,28,28,512 0=1
+ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu -23330=4,3,28,28,512
+Split                    splitncnn_5              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
+Convolution              res3c_branch2a           1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1
+Convolution              res3c_branch2b           1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
+Convolution              res3c_branch2c           1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2
+Eltwise                  res3c                    2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c -23330=4,3,28,28,512 0=1
+ReLU                     res3c_relu               1 1 res3c res3c_res3c_relu -23330=4,3,28,28,512
+Split                    splitncnn_6              1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
+Convolution              res3d_branch2a           1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 8=102 9=1
+Convolution              res3d_branch2b           1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
+Convolution              res3d_branch2c           1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536 8=2
+Eltwise                  res3d                    2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d -23330=4,3,28,28,512 0=1
+ReLU                     res3d_relu               1 1 res3d res3d_res3d_relu -23330=4,3,28,28,512
+Split                    splitncnn_7              1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
+Convolution              res4a_branch1            1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,1024 0=1024 1=1 3=2 5=1 6=524288 8=2
+Convolution              res4a_branch2a           1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=131072 8=102 9=1
+Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4a_branch2c           1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a -23330=4,3,14,14,1024 0=1
+ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu -23330=4,3,14,14,1024
+Split                    splitncnn_8              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4b_branch2c           1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b -23330=4,3,14,14,1024 0=1
+ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu -23330=4,3,14,14,1024
+Split                    splitncnn_9              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res4c_branch2a           1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              res4c_branch2b           1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4c_branch2c           1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4c                    2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c -23330=4,3,14,14,1024 0=1
+ReLU                     res4c_relu               1 1 res4c res4c_res4c_relu -23330=4,3,14,14,1024
+Split                    splitncnn_10             1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res4d_branch2a           1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              res4d_branch2b           1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4d_branch2c           1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4d                    2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d -23330=4,3,14,14,1024 0=1
+ReLU                     res4d_relu               1 1 res4d res4d_res4d_relu -23330=4,3,14,14,1024
+Split                    splitncnn_11             1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res4e_branch2a           1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              res4e_branch2b           1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4e_branch2c           1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4e                    2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e -23330=4,3,14,14,1024 0=1
+ReLU                     res4e_relu               1 1 res4e res4e_res4e_relu -23330=4,3,14,14,1024
+Split                    splitncnn_12             1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res4f_branch2a           1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 8=102 9=1
+Convolution              res4f_branch2b           1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              res4f_branch2c           1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144 8=2
+Eltwise                  res4f                    2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f -23330=4,3,14,14,1024 0=1
+ReLU                     res4f_relu               1 1 res4f res4f_res4f_relu -23330=4,3,14,14,1024
+Split                    splitncnn_13             1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
+Convolution              res5a_branch1            1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,2048 0=2048 1=1 3=2 5=1 6=2097152 8=2
+Convolution              res5a_branch2a           1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=524288 8=102 9=1
+Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              res5a_branch2c           1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2
+Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a -23330=4,3,7,7,2048 0=1
+ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu -23330=4,3,7,7,2048
+Split                    splitncnn_14             1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048
+Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 8=102 9=1
+Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              res5b_branch2c           1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2
+Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b -23330=4,3,7,7,2048 0=1
+ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu -23330=4,3,7,7,2048
+Split                    splitncnn_15             1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048
+Convolution              res5c_branch2a           1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 8=102 9=1
+Convolution              res5c_branch2b           1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              res5c_branch2c           1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576 8=2
+Eltwise                  res5c                    2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c -23330=4,3,7,7,2048 0=1
+ReLU                     res5c_relu               1 1 res5c res5c_res5c_relu -23330=4,3,7,7,2048
+Pooling                  pool5                    1 1 res5c_res5c_relu pool5 -23330=4,3,1,1,2048 0=1 1=7
+InnerProduct             fc1000                   1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=2048000
+Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1
diff --git a/benchmark/models/squeezenet_int8.param b/benchmark/models/squeezenet_int8.param
index c12699380517..b29447ee40f0 100644
--- a/benchmark/models/squeezenet_int8.param
+++ b/benchmark/models/squeezenet_int8.param
@@ -1,50 +1,50 @@
 7767517
 48 56
-Input                    data                     0 1 data 0=227 1=227 2=3
-Convolution              conv1                    1 1 data conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
-Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 1=3 2=2
-Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1
-Split                    splitncnn_0              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
-Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
-Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
-Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
-Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1
-Split                    splitncnn_1              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
-Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
-Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
-Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
-Pooling                  pool3                    1 1 fire3/concat pool3 1=3 2=2
-Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1
-Split                    splitncnn_2              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
-Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
-Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
-Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
-Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1
-Split                    splitncnn_3              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
-Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
-Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
-Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
-Pooling                  pool5                    1 1 fire5/concat pool5 1=3 2=2
-Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1
-Split                    splitncnn_4              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
-Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
-Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
-Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
-Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1
-Split                    splitncnn_5              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
-Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
-Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
-Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
-Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1
-Split                    splitncnn_6              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
-Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
-Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
-Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
-Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1
-Split                    splitncnn_7              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
-Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
-Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
-Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9
-Convolution              conv10                   1 1 fire9/concat_drop9 conv10_relu_conv10 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1
-Pooling                  pool10                   1 1 conv10_relu_conv10 pool10 0=1 4=1
-Softmax                  prob                     1 1 pool10 output
+Input                    data                     0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3
+Convolution              conv1                    1 1 data conv1_relu_conv1 -23330=4,3,113,113,64 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
+Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 -23330=4,3,56,56,64 1=3 2=2
+Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=1024 8=102 9=1
+Split                    splitncnn_0              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16
+Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 8=2 9=1
+Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
+Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,56,56,128
+Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=2048 8=102 9=1
+Split                    splitncnn_1              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16
+Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 8=2 9=1
+Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
+Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,56,56,128
+Pooling                  pool3                    1 1 fire3/concat pool3 -23330=4,3,28,28,128 1=3 2=2
+Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=4096 8=102 9=1
+Split                    splitncnn_2              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
+Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 8=2 9=1
+Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
+Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,28,28,256
+Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 8=102 9=1
+Split                    splitncnn_3              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
+Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 8=2 9=1
+Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
+Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,28,28,256
+Pooling                  pool5                    1 1 fire5/concat pool5 -23330=4,3,14,14,256 1=3 2=2
+Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=12288 8=102 9=1
+Split                    splitncnn_4              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48
+Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 8=2 9=1
+Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
+Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,14,14,384
+Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=18432 8=102 9=1
+Split                    splitncnn_5              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48
+Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 8=2 9=1
+Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
+Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,14,14,384
+Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 8=102 9=1
+Split                    splitncnn_6              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
+Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 8=2 9=1
+Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
+Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,14,14,512
+Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 8=102 9=1
+Split                    splitncnn_7              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
+Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 8=2 9=1
+Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
+Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9 -23330=4,3,14,14,512
+Convolution              conv10                   1 1 fire9/concat_drop9 conv10_relu_conv10 -23330=4,3,16,16,1000 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1
+Pooling                  pool10                   1 1 conv10_relu_conv10 pool10 -23330=4,1,1000,1,1 0=1 4=1
+Softmax                  prob                     1 1 pool10 output -23330=4,1,1000,1,1
diff --git a/benchmark/models/squeezenet_ssd_int8.param b/benchmark/models/squeezenet_ssd_int8.param
index 177d18729cd3..050166202d62 100644
--- a/benchmark/models/squeezenet_ssd_int8.param
+++ b/benchmark/models/squeezenet_ssd_int8.param
@@ -1,121 +1,121 @@
 7767517
 119 152
-Input                    data                     0 1 data 0=300 1=300 2=3
-Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
-Convolution              conv1                    1 1 data_splitncnn_6 conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
-Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 1=3 2=2
-Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1
-Split                    splitncnn_1              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
-Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
-Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
-Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
-Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1
-Split                    splitncnn_2              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
-Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
-Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
-Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
-Pooling                  pool3                    1 1 fire3/concat pool3 1=3 2=2
-Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1
-Split                    splitncnn_3              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
-Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
-Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
-Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
-Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1
-Split                    splitncnn_4              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
-Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
-Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
-Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
-Split                    splitncnn_5              1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1
-Pooling                  pool5                    1 1 fire5/concat_splitncnn_1 pool5 1=3 2=2
-Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1
-Split                    splitncnn_6              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
-Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
-Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
-Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
-Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1
-Split                    splitncnn_7              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
-Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
-Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
-Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
-Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1
-Split                    splitncnn_8              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
-Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
-Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
-Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
-Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1
-Split                    splitncnn_9              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
-Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
-Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
-Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
-Split                    splitncnn_10             1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3
-Pooling                  pool9                    1 1 fire9/concat_splitncnn_3 pool9 1=3 2=2
-Convolution              fire10/squeeze1x1        1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 0=96 1=1 5=1 6=49152 8=102 9=1
-Split                    splitncnn_11             1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1
-Convolution              fire10/expand1x1         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1
-Convolution              fire10/expand3x3         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
-Concat                   fire10/concat            2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat
-Split                    splitncnn_12             1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3
-Pooling                  pool10                   1 1 fire10/concat_splitncnn_3 pool10 1=3 2=2
-Convolution              fire11/squeeze1x1        1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 0=96 1=1 5=1 6=73728 8=102 9=1
-Split                    splitncnn_13             1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1
-Convolution              fire11/expand1x1         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1
-Convolution              fire11/expand3x3         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
-Concat                   fire11/concat            2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat
-Split                    splitncnn_14             1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3
-Convolution              conv12_1                 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu 0=128 1=1 5=1 6=98304 8=102 9=1
-Convolution              conv12_2                 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
-Split                    splitncnn_15             1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3
-Convolution              conv13_1                 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1
-Convolution              conv13_2                 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
-Split                    splitncnn_16             1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2
-BatchNorm                fire5/bn                 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale 0=256
-Split                    splitncnn_17             1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2
-Convolution              fire5_mbox_loc           1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 4=1 5=1 6=36864 8=2
-Permute                  fire5_mbox_loc_perm      1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3
-Flatten                  fire5_mbox_loc_flat      1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat
-Convolution              fire5_mbox_conf          1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 4=1 5=1 6=193536 8=2
-Permute                  fire5_mbox_conf_perm     1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3
-Flatten                  fire5_mbox_conf_flat     1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat
-PriorBox                 fire5_mbox_priorbox      2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
-Convolution              fire9_mbox_loc           1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 4=1 5=1 6=110592 8=2
-Permute                  fire9_mbox_loc_perm      1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3
-Flatten                  fire9_mbox_loc_flat      1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat
-Convolution              fire9_mbox_conf          1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 4=1 5=1 6=580608 8=2
-Permute                  fire9_mbox_conf_perm     1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3
-Flatten                  fire9_mbox_conf_flat     1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat
-PriorBox                 fire9_mbox_priorbox      2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
-Convolution              fire10_mbox_loc          1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2
-Permute                  fire10_mbox_loc_perm     1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3
-Flatten                  fire10_mbox_loc_flat     1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat
-Convolution              fire10_mbox_conf         1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2
-Permute                  fire10_mbox_conf_perm    1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3
-Flatten                  fire10_mbox_conf_flat    1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat
-PriorBox                 fire10_mbox_priorbox     2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
-Convolution              fire11_mbox_loc          1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2
-Permute                  fire11_mbox_loc_perm     1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3
-Flatten                  fire11_mbox_loc_flat     1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat
-Convolution              fire11_mbox_conf         1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2
-Permute                  fire11_mbox_conf_perm    1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3
-Flatten                  fire11_mbox_conf_flat    1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat
-PriorBox                 fire11_mbox_priorbox     2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
-Convolution              conv12_2_mbox_loc        1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 4=1 5=1 6=55296 8=2
-Permute                  conv12_2_mbox_loc_perm   1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3
-Flatten                  conv12_2_mbox_loc_flat   1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat
-Convolution              conv12_2_mbox_conf       1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 4=1 5=1 6=290304 8=2
-Permute                  conv12_2_mbox_conf_perm  1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3
-Flatten                  conv12_2_mbox_conf_flat  1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat
-PriorBox                 conv12_2_mbox_priorbox   2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
-Convolution              conv13_2_mbox_loc        1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 4=1 5=1 6=18432 8=2
-Permute                  conv13_2_mbox_loc_perm   1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3
-Flatten                  conv13_2_mbox_loc_flat   1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat
-Convolution              conv13_2_mbox_conf       1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 4=1 5=1 6=96768 8=2
-Permute                  conv13_2_mbox_conf_perm  1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3
-Flatten                  conv13_2_mbox_conf_flat  1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat
-PriorBox                 conv13_2_mbox_priorbox   2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
-Concat                   mbox_loc                 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc
-Concat                   mbox_conf                6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf
-Concat                   mbox_priorbox            6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1
-Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape 0=21 1=-1
-Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1
-Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten
+Input                    data                     0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3
+Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3
+Convolution              conv1                    1 1 data_splitncnn_6 conv1_relu_conv1 -23330=4,3,149,149,64 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
+Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 -23330=4,3,74,74,64 1=3 2=2
+Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=1024 8=102 9=1
+Split                    splitncnn_1              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16
+Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 8=2 9=1
+Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
+Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,74,74,128
+Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=2048 8=102 9=1
+Split                    splitncnn_2              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16
+Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 8=2 9=1
+Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
+Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,74,74,128
+Pooling                  pool3                    1 1 fire3/concat pool3 -23330=4,3,37,37,128 1=3 2=2
+Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=4096 8=102 9=1
+Split                    splitncnn_3              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32
+Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 8=2 9=1
+Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
+Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,37,37,256
+Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=8192 8=102 9=1
+Split                    splitncnn_4              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32
+Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 8=2 9=1
+Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
+Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,37,37,256
+Split                    splitncnn_5              1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 -23330=8,3,37,37,256,3,37,37,256
+Pooling                  pool5                    1 1 fire5/concat_splitncnn_1 pool5 -23330=4,3,18,18,256 1=3 2=2
+Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=12288 8=102 9=1
+Split                    splitncnn_6              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48
+Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 8=2 9=1
+Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
+Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,18,18,384
+Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=18432 8=102 9=1
+Split                    splitncnn_7              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48
+Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 8=2 9=1
+Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
+Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,18,18,384
+Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=24576 8=102 9=1
+Split                    splitncnn_8              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64
+Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 8=2 9=1
+Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
+Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,18,18,512
+Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=32768 8=102 9=1
+Split                    splitncnn_9              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64
+Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 8=2 9=1
+Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
+Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat -23330=4,3,18,18,512
+Split                    splitncnn_10             1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 -23330=16,3,18,18,512,3,18,18,512,3,18,18,512,3,18,18,512
+Pooling                  pool9                    1 1 fire9/concat_splitncnn_3 pool9 -23330=4,3,9,9,512 1=3 2=2
+Convolution              fire10/squeeze1x1        1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 -23330=4,3,9,9,96 0=96 1=1 5=1 6=49152 8=102 9=1
+Split                    splitncnn_11             1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 -23330=8,3,9,9,96,3,9,9,96
+Convolution              fire10/expand1x1         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 -23330=4,3,9,9,384 0=384 1=1 5=1 6=36864 8=2 9=1
+Convolution              fire10/expand3x3         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 -23330=4,3,9,9,384 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
+Concat                   fire10/concat            2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat -23330=4,3,9,9,768
+Split                    splitncnn_12             1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 -23330=16,3,9,9,768,3,9,9,768,3,9,9,768,3,9,9,768
+Pooling                  pool10                   1 1 fire10/concat_splitncnn_3 pool10 -23330=4,3,4,4,768 1=3 2=2
+Convolution              fire11/squeeze1x1        1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 -23330=4,3,4,4,96 0=96 1=1 5=1 6=73728 8=102 9=1
+Split                    splitncnn_13             1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 -23330=8,3,4,4,96,3,4,4,96
+Convolution              fire11/expand1x1         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 -23330=4,3,4,4,384 0=384 1=1 5=1 6=36864 8=2 9=1
+Convolution              fire11/expand3x3         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 -23330=4,3,4,4,384 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
+Concat                   fire11/concat            2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat -23330=4,3,4,4,768
+Split                    splitncnn_14             1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 -23330=16,3,4,4,768,3,4,4,768,3,4,4,768,3,4,4,768
+Convolution              conv12_1                 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu -23330=4,3,4,4,128 0=128 1=1 5=1 6=98304 8=102 9=1
+Convolution              conv12_2                 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
+Split                    splitncnn_15             1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256
+Convolution              conv13_1                 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 8=102 9=1
+Convolution              conv13_2                 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
+Split                    splitncnn_16             1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128
+BatchNorm                fire5/bn                 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale -23330=4,3,37,37,256 0=256
+Split                    splitncnn_17             1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 -23330=12,3,37,37,256,3,37,37,256,3,37,37,256
+Convolution              fire5_mbox_loc           1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc -23330=4,3,37,37,16 0=16 1=3 4=1 5=1 6=36864 8=2
+Permute                  fire5_mbox_loc_perm      1 1 fire5_mbox_loc fire5_mbox_loc_perm -23330=4,3,16,37,37 0=3
+Flatten                  fire5_mbox_loc_flat      1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat -23330=4,1,21904,1,1
+Convolution              fire5_mbox_conf          1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf -23330=4,3,37,37,84 0=84 1=3 4=1 5=1 6=193536 8=2
+Permute                  fire5_mbox_conf_perm     1 1 fire5_mbox_conf fire5_mbox_conf_perm -23330=4,3,84,37,37 0=3
+Flatten                  fire5_mbox_conf_flat     1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat -23330=4,1,114996,1,1
+PriorBox                 fire5_mbox_priorbox      2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23330=4,2,21904,2,1 -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
+Convolution              fire9_mbox_loc           1 1 fire9/concat_splitncnn_2 fire9_mbox_loc -23330=4,3,18,18,24 0=24 1=3 4=1 5=1 6=110592 8=2
+Permute                  fire9_mbox_loc_perm      1 1 fire9_mbox_loc fire9_mbox_loc_perm -23330=4,3,24,18,18 0=3
+Flatten                  fire9_mbox_loc_flat      1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat -23330=4,1,7776,1,1
+Convolution              fire9_mbox_conf          1 1 fire9/concat_splitncnn_1 fire9_mbox_conf -23330=4,3,18,18,126 0=126 1=3 4=1 5=1 6=580608 8=2
+Permute                  fire9_mbox_conf_perm     1 1 fire9_mbox_conf fire9_mbox_conf_perm -23330=4,3,126,18,18 0=3
+Flatten                  fire9_mbox_conf_flat     1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat -23330=4,1,40824,1,1
+PriorBox                 fire9_mbox_priorbox      2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23330=4,2,7776,2,1 -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
+Convolution              fire10_mbox_loc          1 1 fire10/concat_splitncnn_2 fire10_mbox_loc -23330=4,3,9,9,24 0=24 1=3 4=1 5=1 6=165888 8=2
+Permute                  fire10_mbox_loc_perm     1 1 fire10_mbox_loc fire10_mbox_loc_perm -23330=4,3,24,9,9 0=3
+Flatten                  fire10_mbox_loc_flat     1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat -23330=4,1,1944,1,1
+Convolution              fire10_mbox_conf         1 1 fire10/concat_splitncnn_1 fire10_mbox_conf -23330=4,3,9,9,126 0=126 1=3 4=1 5=1 6=870912 8=2
+Permute                  fire10_mbox_conf_perm    1 1 fire10_mbox_conf fire10_mbox_conf_perm -23330=4,3,126,9,9 0=3
+Flatten                  fire10_mbox_conf_flat    1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat -23330=4,1,10206,1,1
+PriorBox                 fire10_mbox_priorbox     2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23330=4,2,1944,2,1 -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
+Convolution              fire11_mbox_loc          1 1 fire11/concat_splitncnn_2 fire11_mbox_loc -23330=4,3,4,4,24 0=24 1=3 4=1 5=1 6=165888 8=2
+Permute                  fire11_mbox_loc_perm     1 1 fire11_mbox_loc fire11_mbox_loc_perm -23330=4,3,24,4,4 0=3
+Flatten                  fire11_mbox_loc_flat     1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat -23330=4,1,384,1,1
+Convolution              fire11_mbox_conf         1 1 fire11/concat_splitncnn_1 fire11_mbox_conf -23330=4,3,4,4,126 0=126 1=3 4=1 5=1 6=870912 8=2
+Permute                  fire11_mbox_conf_perm    1 1 fire11_mbox_conf fire11_mbox_conf_perm -23330=4,3,126,4,4 0=3
+Flatten                  fire11_mbox_conf_flat    1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat -23330=4,1,2016,1,1
+PriorBox                 fire11_mbox_priorbox     2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23330=4,2,384,2,1 -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
+Convolution              conv12_2_mbox_loc        1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc -23330=4,3,2,2,24 0=24 1=3 4=1 5=1 6=55296 8=2
+Permute                  conv12_2_mbox_loc_perm   1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm -23330=4,3,24,2,2 0=3
+Flatten                  conv12_2_mbox_loc_flat   1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat -23330=4,1,96,1,1
+Convolution              conv12_2_mbox_conf       1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf -23330=4,3,2,2,126 0=126 1=3 4=1 5=1 6=290304 8=2
+Permute                  conv12_2_mbox_conf_perm  1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm -23330=4,3,126,2,2 0=3
+Flatten                  conv12_2_mbox_conf_flat  1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat -23330=4,1,504,1,1
+PriorBox                 conv12_2_mbox_priorbox   2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
+Convolution              conv13_2_mbox_loc        1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc -23330=4,3,1,1,16 0=16 1=3 4=1 5=1 6=18432 8=2
+Permute                  conv13_2_mbox_loc_perm   1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm -23330=4,3,16,1,1 0=3
+Flatten                  conv13_2_mbox_loc_flat   1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat -23330=4,1,16,1,1
+Convolution              conv13_2_mbox_conf       1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf -23330=4,3,1,1,84 0=84 1=3 4=1 5=1 6=96768 8=2
+Permute                  conv13_2_mbox_conf_perm  1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm -23330=4,3,84,1,1 0=3
+Flatten                  conv13_2_mbox_conf_flat  1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat -23330=4,1,84,1,1
+PriorBox                 conv13_2_mbox_priorbox   2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23330=4,2,16,2,1 -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
+Concat                   mbox_loc                 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc -23330=4,1,32120,1,1
+Concat                   mbox_conf                6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf -23330=4,1,168630,1,1
+Concat                   mbox_priorbox            6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox -23330=4,2,32120,2,1 0=1
+Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,8030,1 0=21 1=-1
+Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,8030,1 0=1 1=1
+Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,168630,1,1
 DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000
diff --git a/benchmark/models/vgg16_int8.param b/benchmark/models/vgg16_int8.param
index fa2aff591ada..37159f47ce98 100644
--- a/benchmark/models/vgg16_int8.param
+++ b/benchmark/models/vgg16_int8.param
@@ -1,25 +1,25 @@
 7767517
 23 23
-Input                    data                     0 1 data 0=224 1=224 2=3
-Convolution              conv1_1                  1 1 data conv1_1_relu1_1 0=64 1=3 4=1 5=1 6=1728 8=102 9=1
-Convolution              conv1_2                  1 1 conv1_1_relu1_1 conv1_2_relu1_2 0=64 1=3 4=1 5=1 6=36864 8=2 9=1
-Pooling                  pool1                    1 1 conv1_2_relu1_2 pool1 1=2 2=2
-Convolution              conv2_1                  1 1 pool1 conv2_1_relu2_1 0=128 1=3 4=1 5=1 6=73728 8=102 9=1
-Convolution              conv2_2                  1 1 conv2_1_relu2_1 conv2_2_relu2_2 0=128 1=3 4=1 5=1 6=147456 8=2 9=1
-Pooling                  pool2                    1 1 conv2_2_relu2_2 pool2 1=2 2=2
-Convolution              conv3_1                  1 1 pool2 conv3_1_relu3_1 0=256 1=3 4=1 5=1 6=294912 8=102 9=1
-Convolution              conv3_2                  1 1 conv3_1_relu3_1 conv3_2_relu3_2 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
-Convolution              conv3_3                  1 1 conv3_2_relu3_2 conv3_3_relu3_3 0=256 1=3 4=1 5=1 6=589824 8=2 9=1
-Pooling                  pool3                    1 1 conv3_3_relu3_3 pool3 1=2 2=2
-Convolution              conv4_1                  1 1 pool3 conv4_1_relu4_1 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1
-Convolution              conv4_2                  1 1 conv4_1_relu4_1 conv4_2_relu4_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              conv4_3                  1 1 conv4_2_relu4_2 conv4_3_relu4_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
-Pooling                  pool4                    1 1 conv4_3_relu4_3 pool4 1=2 2=2
-Convolution              conv5_1                  1 1 pool4 conv5_1_relu5_1 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              conv5_2                  1 1 conv5_1_relu5_1 conv5_2_relu5_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
-Convolution              conv5_3                  1 1 conv5_2_relu5_2 conv5_3_relu5_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
-Pooling                  pool5                    1 1 conv5_3_relu5_3 pool5 1=2 2=2
-InnerProduct             fc6                      1 1 pool5 fc6_drop6 0=4096 1=1 2=102760448 8=2 9=1
-InnerProduct             fc7                      1 1 fc6_drop6 fc7_drop7 0=4096 1=1 2=16777216 8=2 9=1
-InnerProduct             fc8                      1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 8=2
-Softmax                  prob                     1 1 fc8 output
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              conv1_1                  1 1 data conv1_1_relu1_1 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=1728 8=102 9=1
+Convolution              conv1_2                  1 1 conv1_1_relu1_1 conv1_2_relu1_2 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=36864 8=2 9=1
+Pooling                  pool1                    1 1 conv1_2_relu1_2 pool1 -23330=4,3,112,112,64 1=2 2=2
+Convolution              conv2_1                  1 1 pool1 conv2_1_relu2_1 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=73728 8=102 9=1
+Convolution              conv2_2                  1 1 conv2_1_relu2_1 conv2_2_relu2_2 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=147456 8=2 9=1
+Pooling                  pool2                    1 1 conv2_2_relu2_2 pool2 -23330=4,3,56,56,128 1=2 2=2
+Convolution              conv3_1                  1 1 pool2 conv3_1_relu3_1 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=294912 8=102 9=1
+Convolution              conv3_2                  1 1 conv3_1_relu3_1 conv3_2_relu3_2 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
+Convolution              conv3_3                  1 1 conv3_2_relu3_2 conv3_3_relu3_3 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 8=2 9=1
+Pooling                  pool3                    1 1 conv3_3_relu3_3 pool3 -23330=4,3,28,28,256 1=2 2=2
+Convolution              conv4_1                  1 1 pool3 conv4_1_relu4_1 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1
+Convolution              conv4_2                  1 1 conv4_1_relu4_1 conv4_2_relu4_2 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              conv4_3                  1 1 conv4_2_relu4_2 conv4_3_relu4_3 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
+Pooling                  pool4                    1 1 conv4_3_relu4_3 pool4 -23330=4,3,14,14,512 1=2 2=2
+Convolution              conv5_1                  1 1 pool4 conv5_1_relu5_1 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              conv5_2                  1 1 conv5_1_relu5_1 conv5_2_relu5_2 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
+Convolution              conv5_3                  1 1 conv5_2_relu5_2 conv5_3_relu5_3 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
+Pooling                  pool5                    1 1 conv5_3_relu5_3 pool5 -23330=4,3,7,7,512 1=2 2=2
+InnerProduct             fc6                      1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=102760448 8=2 9=1
+InnerProduct             fc7                      1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 8=2 9=1
+InnerProduct             fc8                      1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000 8=2
+Softmax                  prob                     1 1 fc8 output -23330=4,1,1000,1,1
diff --git a/docs/developer-guide/glsl-extension.md b/docs/developer-guide/glsl-extension.md
index 110fa4fafa4b..6fad73c4a539 100644
--- a/docs/developer-guide/glsl-extension.md
+++ b/docs/developer-guide/glsl-extension.md
@@ -166,6 +166,41 @@ shared lfp tmp_a[8][4][2];
 |lfp|float|float|float|float16_t|float|bfloat16_t|
 |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4|
 
+## integer type
+
+declare int8/int16 buffer data layout and local variables in glsl code
+
+```c
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer weight_blob { sint16 weight_blob_data[]; };
+```
+
+|int8 storage type|int8p|int8s|int8s+int8a|
+|---|---|---|---|
+|sint8|int|int8_t|int8_t|
+|sint8vec4|int|int|int|
+
+|int8 arithmetic type|int8|
+|---|---|
+|aint8|int|
+|aint8vec4|ivec4|
+
+|int16 arithmetic type|int16|
+|---|---|
+|aint16|int16_t when shaderInt16 is available, otherwise int|
+|aint16vec4|i16vec4 when shaderInt16 is available, otherwise ivec4|
+
+|int16 storage/local type|int16p|int16s|
+|---|---|---|
+|sint16|int|int16_t|
+|sint16vec4|ivec2|i16vec4|
+|lint16|int|int16_t|
+|lint16vec4|ivec2|i16vec4|
+
+`sint8vec4` uses one `int` to hold four signed int8 lanes in all int8 storage modes. This keeps pack4 data in packed form for integer dot-product and shared-memory paths. Use `i8buffer_ld4` to unpack it to `ivec4`, and use `i8buffer_sm4` to load the raw packed `int`.
+
+`sint16` uses one `int` to hold two signed int16 lanes when `opt.use_int16_packed` is enabled, and uses native `int16_t` when `opt.use_int16_storage` is enabled. `sint16vec4` stores four logical int16 lanes as two packed `int` values in int16p mode and as native `i16vec4` in int16s mode. `lint16` and `lint16vec4` are the shared/local-memory counterparts.
+
 # buffer functions
 
 - load typed value from src[offset]
@@ -203,6 +238,77 @@ void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets);
 ```c
 void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset);
 ```
+
+# integer buffer functions
+
+- load integer typed value from src[offset]
+
+```c
+aint8 i8buffer_ld1(sint8 src, int offset);
+aint8vec4 i8buffer_ld4(sint8vec4 src, int offset);
+int i8buffer_sm4(sint8vec4 src, int offset);
+int i16buffer_ld1(sint16 src, int offset);
+ivec2 i16buffer_ld2(sint16 src, int offset);
+sint16vec4 i16buffer_sm4(sint16vec4 src, int offset);
+aint16vec4 i16buffer_ld4(sint16vec4 src, int offset);
+aint16 lint162aint16(lint16 v);
+aint16vec4 lint162aint16vec4(lint16vec4 v);
+```
+
+`i8buffer_sm4` loads the raw packed `int` representation of four int8 lanes. It is useful for shared-memory staging and `dotPacked4x8EXT` paths where unpacking to `ivec4` would be wasteful.
+
+`i16buffer_ld1` and `i16buffer_ld2` load signed int16 lanes as `int` and `ivec2`, respectively. Without native int16 storage, `offset` is still the logical int16 lane offset, and packed storage groups two adjacent lanes in one `int`.
+
+`i16buffer_sm4` loads the raw `sint16vec4` representation of four logical int16 lanes from buffer storage. `i16buffer_ld4` loads four logical int16 lanes from buffer storage as `aint16vec4`. `lint162aint16` and `lint162aint16vec4` convert shared/local int16 values to arithmetic int16 values.
+
+- store integer typed value to dst[offset]
+
+```c
+void i8buffer_st1(sint8 dst, int offset, aint8 v);
+void i8buffer_st4(sint8vec4 dst, int offset, aint8vec4 v);
+void i16buffer_st1(sint16 dst, int offset, int v);
+void i16buffer_st2(sint16 dst, int offset, ivec2 v);
+void i16buffer_st4(sint16vec4 dst, int offset, ivec4 v);
+void i16buffer_st4(lint16vec4 dst, int offset, ivec4 v);
+```
+
+Without native int8 storage, `i8buffer_st1` updates one byte lane inside a packed `int` and may use an atomic compare-and-swap loop.
+
+Without native int16 storage, `i16buffer_st1` updates one int16 lane inside a packed `int` and may use an atomic compare-and-swap loop. `i16buffer_st2` stores complete packed words directly when `offset` is aligned. `i16buffer_st4` writes four logical int16 lanes to `sint16vec4` storage or `lint16vec4` shared/local memory.
+
+- copy int8 typed value from src[src_offset] to dst[dst_offset]
+
+```c
+void i8buffer_cp1(sint8 dst, int dst_offset, sint8 src, int src_offset);
+void i8buffer_cp4(sint8vec4 dst, int dst_offset, sint8vec4 src, int src_offset);
+```
+
+- copy and pack int8 typed values from src[src_offsets[0],src_offsets[1],...] to dst[dst_offset]
+
+```c
+void i8buffer_cp1to4(sint8vec4 dst, int dst_offset, sint8 src, ivec4 src_offsets);
+```
+
+- copy and unpack int8 typed values from src[src_offset] to dst[dst_offsets[0],dst_offsets[1],...]
+
+```c
+void i8buffer_cp4to1(sint8 dst, ivec4 dst_offsets, sint8vec4 src, int src_offset);
+```
+
+- pack and unpack signed integer lanes
+
+```c
+ivec4 unpackInt4x8(int v);
+int packInt4x8(ivec4 v);
+ivec2 unpackInt2x16(int v);
+int packInt2x16(ivec2 v);
+int float2int8(float v);
+ivec4 float2int8vec4(vec4 v);
+```
+
+`packInt4x8` stores `.r/.g/.b/.a` in the low-to-high bytes of one `int`. `packInt2x16` stores `.r/.g` in the low-to-high 16-bit lanes of one `int`.
+`float2int8` and `float2int8vec4` round half away from zero and saturate to [-127, 127] for deterministic int8 quantization.
+
 # local data conversion functions
 
 - storage buffer to local memory
@@ -314,6 +420,16 @@ void main()
     // here is the code path optimized for subgroup_size == 32
 #endif
 
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+    // here is the packed int8 dot-product path
+#endif
+
+#if ncnn_VK_KHR_cooperative_matrix
+    // here is the KHR cooperative matrix path
+#elif ncnn_VK_NV_cooperative_matrix
+    // here is the NV cooperative matrix path
+#endif
+
     // use macro definitions
     uint size; // dynamic value from some previous routines
     if (size < ncnn_subgroupSize)
@@ -329,6 +445,12 @@ void main()
 }
 ```
 
+Cooperative matrix shape and component-type combinations are selected on the host side. Before creating a cooperative matrix pipeline, use `GpuInfo::support_cooperative_matrix()`, `GpuInfo::support_int8_cooperative_matrix()`, `GpuInfo::support_bf16_cooperative_matrix()`, and `GpuInfo::get_optimal_cooperative_matrix_mnk()` to check device support and choose the shape.
+
+For signed int8 cooperative matrix kernels, ncnn requires signed int8 A/B and signed int32 accumulator/result cooperative matrix support at subgroup scope. The shader still uses the normal `ncnn_VK_KHR_cooperative_matrix` / `ncnn_VK_NV_cooperative_matrix` extension macros to select GLSL syntax, while the host selects this path with `support_int8_cooperative_matrix()`.
+
+In int8 cooperative matrix and integer dot-product shaders, prefer keeping pack4 data in the packed `sint8vec4` representation and use `i8buffer_sm4` for shared-memory staging when the layout is already packed. Use `i8buffer_ld4` only when arithmetic needs unpacked `ivec4` lanes.
+
 ### validation layer macros
 
 ncnn will define some additional convenient macros when the vulkan validation layer enabled
@@ -357,7 +479,7 @@ At runtime, `NCNN_LOGE` will print out the value of `gx`
 
 enable glsl extension only if user enable some options
 
-The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage` or `opt.use_bf16_storage`
+The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage`, `opt.use_bf16_storage`, or `opt.use_int16_storage`
 
 The `GL_EXT_shader_explicit_arithmetic_types_float16` extension will be automatically enabled without explicit code indication when the device supports 16-bit arithmetic and the user turns on `opt.use_fp16_arithmetic`
 
@@ -388,6 +510,10 @@ void main()
 |NCNN_int8_packed|opt.use_int8_packed|
 |NCNN_int8_storage|opt.use_int8_storage|
 |NCNN_int8_arithmetic|opt.use_int8_arithmetic|
+|NCNN_int16_packed|opt.use_int16_packed|
+|NCNN_int16_storage|opt.use_int16_storage|
 |NCNN_bf16_packed|opt.use_bf16_packed|
 |NCNN_bf16_storage|opt.use_bf16_storage|
+|NCNN_fp16_uniform|opt.use_fp16_uniform|
+|NCNN_int8_uniform|opt.use_int8_uniform|
 |NCNN_shader_local_memory|opt.use_shader_local_memory|
diff --git a/docs/developer-guide/glsl-extension.zh.md b/docs/developer-guide/glsl-extension.zh.md
index c586c784635e..290f8be65bae 100644
--- a/docs/developer-guide/glsl-extension.zh.md
+++ b/docs/developer-guide/glsl-extension.zh.md
@@ -166,6 +166,41 @@ shared lfp tmp_a[8][4][2];
 |lfp|float|float|float|float16_t|float|bfloat16_t|
 |lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4|
 
+## 整数类型(integer type)
+
+在 GLSL 代码中声明 int8/int16 缓冲区数据布局和局部变量
+
+```c
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer weight_blob { sint16 weight_blob_data[]; };
+```
+
+|int8 存储类型|int8p|int8s|int8s+int8a|
+|---|---|---|---|
+|sint8|int|int8_t|int8_t|
+|sint8vec4|int|int|int|
+
+|int8 算术类型|int8|
+|---|---|
+|aint8|int|
+|aint8vec4|ivec4|
+
+|int16 算术类型|int16|
+|---|---|
+|aint16|shaderInt16 可用时为 int16_t，否则为 int|
+|aint16vec4|shaderInt16 可用时为 i16vec4，否则为 ivec4|
+
+|int16 存储/local 类型|int16p|int16s|
+|---|---|---|
+|sint16|int|int16_t|
+|sint16vec4|ivec2|i16vec4|
+|lint16|int|int16_t|
+|lint16vec4|ivec2|i16vec4|
+
+`sint8vec4` 在所有 int8 存储模式下都使用一个 `int` 保存四个有符号 int8 lane。这样可以让 pack4 数据在 integer dot product 和 shared memory 路径中保持 packed 形式。使用 `i8buffer_ld4` 解包为 `ivec4`，使用 `i8buffer_sm4` 直接加载 packed `int`。
+
+启用 `opt.use_int16_packed` 时，`sint16` 使用一个 `int` 保存两个有符号 int16 lane；启用 `opt.use_int16_storage` 时，`sint16` 使用原生 `int16_t`。`sint16vec4` 在 int16p 模式下使用两个 packed `int` 保存四个逻辑 int16 lane，在 int16s 模式下使用原生 `i16vec4`。`lint16` 和 `lint16vec4` 是 shared/local memory 对应类型。
+
 # 缓冲区函数(buffer functions)
 
 - 从 src[offset] 加载已经确定类型的值
@@ -204,6 +239,76 @@ void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets);
 void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset);
 ```
 
+# 整数缓冲区函数(integer buffer functions)
+
+- 从 src[offset] 加载整数类型的值
+
+```c
+aint8 i8buffer_ld1(sint8 src, int offset);
+aint8vec4 i8buffer_ld4(sint8vec4 src, int offset);
+int i8buffer_sm4(sint8vec4 src, int offset);
+int i16buffer_ld1(sint16 src, int offset);
+ivec2 i16buffer_ld2(sint16 src, int offset);
+sint16vec4 i16buffer_sm4(sint16vec4 src, int offset);
+aint16vec4 i16buffer_ld4(sint16vec4 src, int offset);
+aint16 lint162aint16(lint16 v);
+aint16vec4 lint162aint16vec4(lint16vec4 v);
+```
+
+`i8buffer_sm4` 加载四个 int8 lane 的原始 packed `int` 表示。它适用于 shared memory 暂存和 `dotPacked4x8EXT` 路径，避免先解包成 `ivec4` 再重新打包。
+
+`i16buffer_ld1` 和 `i16buffer_ld2` 分别将有符号 int16 lane 加载为 `int` 和 `ivec2`。没有原生 int16 storage 时，`offset` 仍表示逻辑 int16 lane 偏移，packed storage 会把相邻两个 lane 放在一个 `int` 中。
+
+`i16buffer_sm4` 从 buffer storage 加载四个逻辑 int16 lane 的原始 `sint16vec4` 表示。`i16buffer_ld4` 从 buffer storage 将四个逻辑 int16 lane 加载为 `aint16vec4`。`lint162aint16` 和 `lint162aint16vec4` 将 shared/local int16 值转换为算术 int16 值。
+
+- 将整数类型的值存储到 dst[offset]
+
+```c
+void i8buffer_st1(sint8 dst, int offset, aint8 v);
+void i8buffer_st4(sint8vec4 dst, int offset, aint8vec4 v);
+void i16buffer_st1(sint16 dst, int offset, int v);
+void i16buffer_st2(sint16 dst, int offset, ivec2 v);
+void i16buffer_st4(sint16vec4 dst, int offset, ivec4 v);
+void i16buffer_st4(lint16vec4 dst, int offset, ivec4 v);
+```
+
+没有原生 int8 storage 时，`i8buffer_st1` 会更新 packed `int` 中的一个 byte lane，可能使用 atomic compare-and-swap 循环。
+
+没有原生 int16 storage 时，`i16buffer_st1` 会更新 packed `int` 中的一个 int16 lane，可能使用 atomic compare-and-swap 循环。`i16buffer_st2` 在 `offset` 对齐时会直接写入完整 packed word。`i16buffer_st4` 将四个逻辑 int16 lane 写入 `sint16vec4` 存储或 `lint16vec4` shared/local memory。
+
+- 从 src[src_offset] 的 int8 类型值拷贝到 dst[dst_offset]
+
+```c
+void i8buffer_cp1(sint8 dst, int dst_offset, sint8 src, int src_offset);
+void i8buffer_cp4(sint8vec4 dst, int dst_offset, sint8vec4 src, int src_offset);
+```
+
+- 从 src[src_offsets[0],src_offsets[1],...] 的 int8 类型值拷贝并打包到 dst[dst_offset]
+
+```c
+void i8buffer_cp1to4(sint8vec4 dst, int dst_offset, sint8 src, ivec4 src_offsets);
+```
+
+- 从 src[src_offset] 的 int8 类型值拷贝并解包到 dst[dst_offsets[0],dst_offsets[1],...]
+
+```c
+void i8buffer_cp4to1(sint8 dst, ivec4 dst_offsets, sint8vec4 src, int src_offset);
+```
+
+- 打包和解包有符号整数 lane
+
+```c
+ivec4 unpackInt4x8(int v);
+int packInt4x8(ivec4 v);
+ivec2 unpackInt2x16(int v);
+int packInt2x16(ivec2 v);
+int float2int8(float v);
+ivec4 float2int8vec4(vec4 v);
+```
+
+`packInt4x8` 将 `.r/.g/.b/.a` 按低字节到高字节保存到一个 `int`。`packInt2x16` 将 `.r/.g` 按低 16-bit lane 到高 16-bit lane 保存到一个 `int`。
+`float2int8` 和 `float2int8vec4` 使用 half-away-from-zero 规则，并饱和到 [-127, 127]，用于确定性的 int8 量化。
+
 # 本地数据转换函数
 
 - 存储缓冲区转换到本地内存
@@ -315,6 +420,16 @@ void main()
     // 为 subgroup_size == 32 优化的代码路径
 #endif
 
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+    // packed int8 dot-product 路径
+#endif
+
+#if ncnn_VK_KHR_cooperative_matrix
+    // KHR cooperative matrix 路径
+#elif ncnn_VK_NV_cooperative_matrix
+    // NV cooperative matrix 路径
+#endif
+
     // 使用宏定义
     uint size; // 来自先前例程的动态值
     if (size < ncnn_subgroupSize)
@@ -330,6 +445,12 @@ void main()
 }
 ```
 
+Cooperative matrix 的形状和 component type 组合在 host 侧选择。创建 cooperative matrix pipeline 前，应使用 `GpuInfo::support_cooperative_matrix()`、`GpuInfo::support_int8_cooperative_matrix()`、`GpuInfo::support_bf16_cooperative_matrix()` 和 `GpuInfo::get_optimal_cooperative_matrix_mnk()` 判断能力并选择参数。
+
+对于有符号 int8 cooperative matrix kernel，ncnn 要求设备支持 subgroup scope 下的 signed int8 A/B 和 signed int32 accumulator/result cooperative matrix。shader 仍然使用普通的 `ncnn_VK_KHR_cooperative_matrix` / `ncnn_VK_NV_cooperative_matrix` 扩展宏选择 GLSL 语法，host 侧通过 `support_int8_cooperative_matrix()` 选择该路径。
+
+在 int8 cooperative matrix 和 integer dot-product shader 中，如果数据布局已经是 packed 形式，应优先保持 `sint8vec4` 的 packed 表示，并使用 `i8buffer_sm4` 做 shared memory 暂存。只有在算术逻辑需要解包后的 `ivec4` lane 时才使用 `i8buffer_ld4`。
+
 ### 验证层宏定义
 
 当启用 vulkan 验证层时，ncnn 会定义一些额外的便捷宏
@@ -358,7 +479,7 @@ void main()
 
 仅当用户启用某些选项时才启用 GLSL 扩展
 
-`GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage` 或 `opt.use_bf16_storage` 选项时，自动启用，无需显式代码指示。
+`GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage`、`opt.use_bf16_storage` 或 `opt.use_int16_storage` 选项时，自动启用，无需显式代码指示。
 
 `GL_EXT_shader_explicit_arithmetic_types_float16` 扩展会在设备支持 16 位算术运算且用户开启了 `opt.use_fp16_arithmetic` 选项时，自动启用，无需显式代码指示。
 
@@ -389,6 +510,10 @@ void main()
 |NCNN_int8_packed|opt.use_int8_packed|
 |NCNN_int8_storage|opt.use_int8_storage|
 |NCNN_int8_arithmetic|opt.use_int8_arithmetic|
+|NCNN_int16_packed|opt.use_int16_packed|
+|NCNN_int16_storage|opt.use_int16_storage|
 |NCNN_bf16_packed|opt.use_bf16_packed|
 |NCNN_bf16_storage|opt.use_bf16_storage|
+|NCNN_fp16_uniform|opt.use_fp16_uniform|
+|NCNN_int8_uniform|opt.use_int8_uniform|
 |NCNN_shader_local_memory|opt.use_shader_local_memory|
diff --git a/src/c_api.cpp b/src/c_api.cpp
index efe8b771368a..4e7b0991f2ce 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -334,6 +334,16 @@ int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt)
     return ((const Option*)opt)->use_int8_arithmetic;
 }
 
+int ncnn_option_get_use_int16_packed(const ncnn_option_t opt)
+{
+    return ((const Option*)opt)->use_int16_packed; // opt is the C handle for an Option; return its use_int16_packed flag as int
+}
+
+int ncnn_option_get_use_int16_storage(const ncnn_option_t opt)
+{
+    return ((const Option*)opt)->use_int16_storage; // opt is the C handle for an Option; return its use_int16_storage flag as int
+}
+
 int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt)
 {
     return ((const Option*)opt)->use_bf16_packed;
@@ -424,6 +434,16 @@ void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable)
     ((Option*)opt)->use_int8_arithmetic = enable;
 }
 
+void ncnn_option_set_use_int16_packed(ncnn_option_t opt, int enable)
+{
+    ((Option*)opt)->use_int16_packed = enable; // opt is the C handle for an Option; assign enable to its use_int16_packed flag
+}
+
+void ncnn_option_set_use_int16_storage(ncnn_option_t opt, int enable)
+{
+    ((Option*)opt)->use_int16_storage = enable; // opt is the C handle for an Option; assign enable to its use_int16_storage flag
+}
+
 void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable)
 {
     ((Option*)opt)->use_bf16_packed = enable;
diff --git a/src/c_api.h b/src/c_api.h
index 918cced4ded3..d0013c625f8e 100644
--- a/src/c_api.h
+++ b/src/c_api.h
@@ -75,6 +75,8 @@ NCNN_EXPORT int ncnn_option_get_use_fp16_arithmetic(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_int8_packed(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_int8_storage(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt);
+NCNN_EXPORT int ncnn_option_get_use_int16_packed(const ncnn_option_t opt);
+NCNN_EXPORT int ncnn_option_get_use_int16_storage(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_bf16_storage(const ncnn_option_t opt);
 NCNN_EXPORT int ncnn_option_get_use_shader_local_memory(const ncnn_option_t opt);
@@ -91,6 +93,8 @@ NCNN_EXPORT void ncnn_option_set_use_fp16_arithmetic(ncnn_option_t opt, int enab
 NCNN_EXPORT void ncnn_option_set_use_int8_packed(ncnn_option_t opt, int enable);
 NCNN_EXPORT void ncnn_option_set_use_int8_storage(ncnn_option_t opt, int enable);
 NCNN_EXPORT void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable);
+NCNN_EXPORT void ncnn_option_set_use_int16_packed(ncnn_option_t opt, int enable);
+NCNN_EXPORT void ncnn_option_set_use_int16_storage(ncnn_option_t opt, int enable);
 NCNN_EXPORT void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable);
 NCNN_EXPORT void ncnn_option_set_use_bf16_storage(ncnn_option_t opt, int enable);
 NCNN_EXPORT void ncnn_option_set_use_shader_local_memory(ncnn_option_t opt, int enable);
diff --git a/src/gpu.cpp b/src/gpu.cpp
index 5da5c6ccacb1..cbc44ff29456 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -337,6 +337,9 @@ class GpuInfoPrivate
     bool support_cooperative_matrix_16_8_16;
     bool support_cooperative_matrix_16_16_16;
 
+    // int8 cooperative matrix feature
+    bool support_int8_cooperative_matrix;
+
     // bf16 cooperative matrix feature
     bool support_bf16_cooperative_matrix;
 
@@ -1436,6 +1439,7 @@ void GpuInfoPrivate::query_extension_properties()
     support_cooperative_matrix_16_8_8 = false;
     support_cooperative_matrix_16_8_16 = false;
     support_cooperative_matrix_16_16_16 = false;
+    support_int8_cooperative_matrix = false;
     support_bf16_cooperative_matrix = false;
     if (support_VK_KHR_cooperative_matrix && queryCooperativeMatrixFeatures.cooperativeMatrix)
     {
@@ -1493,6 +1497,13 @@ void GpuInfoPrivate::query_extension_properties()
                 support_cooperative_matrix_16_16_16 = true;
             }
 
+            if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR
+                    && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR
+                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+            {
+                support_int8_cooperative_matrix = true;
+            }
+
             if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR
                     && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                     && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
@@ -1556,6 +1567,13 @@ void GpuInfoPrivate::query_extension_properties()
             {
                 support_cooperative_matrix_16_16_16 = true;
             }
+
+            if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV
+                    && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV
+                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+            {
+                support_int8_cooperative_matrix = true;
+            }
         }
     }
 
@@ -1582,12 +1600,6 @@ void GpuInfoPrivate::query_extension_properties()
         {
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret);
         }
-
-        for (uint32_t j = 0; j < propertyCount; j++)
-        {
-            const VkCooperativeMatrixFlexibleDimensionsPropertiesNV& cmfdp = queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j];
-            // NCNN_LOGE("cmfdp %2d %2d %2d  %d %d %d %d  %d %d %d", cmfdp.MGranularity, cmfdp.NGranularity, cmfdp.KGranularity, cmfdp.AType, cmfdp.BType, cmfdp.CType, cmfdp.ResultType, cmfdp.saturatingAccumulation, cmfdp.scope, cmfdp.workgroupInvocations);
-        }
     }
 
     // query supported cooperative vector types and operations
@@ -1613,12 +1625,6 @@ void GpuInfoPrivate::query_extension_properties()
         {
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret);
         }
-
-        for (uint32_t j = 0; j < propertyCount; j++)
-        {
-            const VkCooperativeVectorPropertiesNV& cvp = queryCooperativeVectorSubPropertiesNV[j];
-            // NCNN_LOGE("cvp %d %d %d %d %d  %d", cvp.inputType, cvp.inputInterpretation, cvp.matrixInterpretation, cvp.biasInterpretation, cvp.resultType, cvp.transpose);
-        }
     }
 
     if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_TURNIP)
@@ -1943,6 +1949,21 @@ bool GpuInfo::support_int8_arithmetic() const
     return d->queryFloat16Int8Features.shaderInt8;
 }
 
+bool GpuInfo::support_int16_packed() const
+{
+    return true; // unconditional, same as support_bf16_packed(); no device feature is consulted
+}
+
+bool GpuInfo::support_int16_storage() const
+{
+    return d->query16BitStorageFeatures.storageBuffer16BitAccess; // storage capability only; int16 arithmetic is reported by support_int16_arithmetic()
+}
+
+bool GpuInfo::support_int16_arithmetic() const
+{
+    return d->physicalDevicefeatures.shaderInt16; // shaderInt16 flag of the stored physical device features
+}
+
 bool GpuInfo::support_bf16_packed() const
 {
     return true;
@@ -1998,6 +2019,11 @@ bool GpuInfo::support_cooperative_matrix_16_16_16() const
     return d->support_cooperative_matrix_16_16_16;
 }
 
+bool GpuInfo::support_int8_cooperative_matrix() const
+{
+    return d->support_int8_cooperative_matrix && support_int8_arithmetic(); // needs both the sint8 matrix property flag and shaderInt8
+}
+
 bool GpuInfo::support_bf16_cooperative_matrix() const
 {
     return d->support_bf16_cooperative_matrix;
@@ -3554,6 +3580,8 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
     opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
     opt.use_int8_packed = use_int8; // int8p is always supported
     opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();
+    opt.use_int16_packed = false;
+    opt.use_int16_storage = false;
     opt.use_bf16_packed = use_bf16; // bf16p is always supported
     opt.use_bf16_storage = use_bf16 && vkdev->info.support_bf16_storage();
 
@@ -3633,6 +3661,8 @@ void VulkanDevicePrivate::destroy_utility_operator()
             opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
             opt.use_int8_packed = false;
             opt.use_int8_storage = false;
+            opt.use_int16_packed = false;
+            opt.use_int16_storage = false;
             opt.use_bf16_packed = false;
             opt.use_bf16_storage = false;
 
@@ -3660,6 +3690,8 @@ void VulkanDevicePrivate::destroy_utility_operator()
         opt.use_fp16_storage = false;
         opt.use_int8_packed = use_int8;
         opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();
+        opt.use_int16_packed = false;
+        opt.use_int16_storage = false;
         opt.use_bf16_packed = false;
         opt.use_bf16_storage = false;
 
@@ -3689,6 +3721,8 @@ void VulkanDevicePrivate::destroy_utility_operator()
         opt.use_fp16_storage = false;
         opt.use_int8_packed = false;
         opt.use_int8_storage = false;
+        opt.use_int16_packed = false;
+        opt.use_int16_storage = false;
         opt.use_bf16_packed = use_bf16;
         opt.use_bf16_storage = use_bf16 && vkdev->info.support_bf16_storage();
 
@@ -5097,6 +5131,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
     const GpuInfo& info = get_gpu_info(device_index);
     const bool support_fp16_storage = info.support_fp16_storage();
     const bool support_fp16_uniform = info.support_fp16_uniform();
+    const bool support_int16_arithmetic = info.physicalDevicefeatures().shaderInt16;
 
     if (opt.use_bf16_storage)
     {
@@ -5448,6 +5483,42 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
         custom_defines.append("sint8", "int");
     }
 
+    if (opt.use_int16_storage)
+    {
+        custom_defines.append("NCNN_int16_storage", 1);
+        custom_defines.append("sint16", "int16_t");
+        custom_defines.append("sint16vec4", "i16vec4");
+        custom_defines.append("lint16", "int16_t");
+        custom_defines.append("lint16vec4", "i16vec4");
+        custom_defines.append("aint16", support_int16_arithmetic ? "int16_t" : "int");
+        custom_defines.append("aint16vec4", support_int16_arithmetic ? "i16vec4" : "ivec4");
+        custom_defines.append("lint162aint16(v)", support_int16_arithmetic ? "v" : "int(v)");
+        custom_defines.append("lint162aint16vec4(v)", support_int16_arithmetic ? "v" : "ivec4(v)");
+    }
+    else if (opt.use_int16_packed)
+    {
+        custom_defines.append("NCNN_int16_packed", 1);
+        custom_defines.append("sint16", "int");
+        custom_defines.append("sint16vec4", "ivec2");
+        custom_defines.append("lint16", "int");
+        custom_defines.append("lint16vec4", "ivec2");
+        custom_defines.append("aint16", support_int16_arithmetic ? "int16_t" : "int");
+        custom_defines.append("aint16vec4", support_int16_arithmetic ? "i16vec4" : "ivec4");
+        custom_defines.append("lint162aint16(v)", support_int16_arithmetic ? "int16_t(v)" : "int(v)");
+        custom_defines.append("lint162aint16vec4(v)", support_int16_arithmetic ? "i16vec4(unpack16(v.r),unpack16(v.g))" : "ivec4(unpackInt2x16(v.r),unpackInt2x16(v.g))");
+    }
+    else
+    {
+        custom_defines.append("sint16", "int");
+        custom_defines.append("sint16vec4", "ivec4");
+        custom_defines.append("lint16", "int");
+        custom_defines.append("lint16vec4", "ivec4");
+        custom_defines.append("aint16", "int");
+        custom_defines.append("aint16vec4", "ivec4");
+        custom_defines.append("lint162aint16(v)", "v");
+        custom_defines.append("lint162aint16vec4(v)", "v");
+    }
+
     custom_defines.append("sint8vec4", "int");
 
     custom_defines.append("aint8", "int");
@@ -5455,6 +5526,10 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
 
     custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)");
     custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))");
+    custom_defines.append("unpackInt2x16(v)", "ivec2((int(v)<<16)>>16,int(v)>>16)");
+    custom_defines.append("packInt2x16(v)", "int((uint(v.r)&0xFFFFu)|((uint(v.g)&0xFFFFu)<<16))");
+    custom_defines.append("float2int8(v)", "int(clamp(float(v)+(float(v)>=0.f?0.5f:-0.5f),-127.f,127.f))");
+    custom_defines.append("float2int8vec4(v)", "ivec4(clamp(vec4(v)+mix(vec4(-0.5f),vec4(0.5f),greaterThanEqual(vec4(v),vec4(0.f))),vec4(-127.f),vec4(127.f)))");
 
     if (opt.use_int8_storage)
     {
@@ -5470,8 +5545,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
     }
 
     custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])");
+    custom_defines.append("i8buffer_sm4(buf,i)", "buf[i]");
     custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}");
     custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
+    custom_defines.append("i8buffer_cp1to4(buf,i,sbuf,si)", "{ivec4 _v=ivec4(i8buffer_ld1(sbuf,si.r),i8buffer_ld1(sbuf,si.g),i8buffer_ld1(sbuf,si.b),i8buffer_ld1(sbuf,si.a));i8buffer_st4(buf,i,_v);}");
+    custom_defines.append("i8buffer_cp4to1(buf,i4,sbuf,si)", "{ivec4 _v=i8buffer_ld4(sbuf,si);i8buffer_st1(buf,i4.r,_v.r);i8buffer_st1(buf,i4.g,_v.g);i8buffer_st1(buf,i4.b,_v.b);i8buffer_st1(buf,i4.a,_v.a);}");
+
+    if (opt.use_int16_storage)
+    {
+        custom_defines.append("i16buffer_ld1(buf,i)", "int(buf[i])");
+        custom_defines.append("i16buffer_st1(buf,i,v)", "{buf[i]=int16_t(v);}");
+    }
+    else if (opt.use_int16_packed)
+    {
+        custom_defines.append("i16buffer_ld1(buf,i)", "unpackInt2x16(buf[(i)/2])[(i)%2]");
+        custom_defines.append("i16buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);ivec2 _v=unpackInt2x16(_old_v);_v[_im2]=_vs;_new_v=packInt2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");
+    }
+    else
+    {
+        custom_defines.append("i16buffer_ld1(buf,i)", "int(buf[i])");
+        custom_defines.append("i16buffer_st1(buf,i,v)", "{buf[i]=int(v);}");
+    }
+    custom_defines.append("i16buffer_ld2(buf,i)", "ivec2(i16buffer_ld1(buf,i),i16buffer_ld1(buf,(i)+1))");
+    if (opt.use_int16_storage)
+    {
+        custom_defines.append("i16buffer_st2(buf,i,v)", "{ivec2 _v=ivec2(v);buf[i]=int16_t(_v.r);buf[(i)+1]=int16_t(_v.g);}");
+        custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]");
+        custom_defines.append("i16buffer_ld4(buf,i)", support_int16_arithmetic ? "buf[i]" : "ivec4(buf[i])");
+        custom_defines.append("i16buffer_st4(buf,i,v)", "{buf[i]=i16vec4(v);}");
+    }
+    else if (opt.use_int16_packed)
+    {
+        custom_defines.append("i16buffer_st2(buf,i,v)", "{uint _i=uint(i);ivec2 _v=ivec2(v);if((_i&1u)==0u){buf[_i/2]=packInt2x16(_v);}else{i16buffer_st1(buf,int(_i),_v.r);i16buffer_st1(buf,int(_i)+1,_v.g);}}");
+        custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]");
+        custom_defines.append("i16buffer_ld4(buf,i)", support_int16_arithmetic ? "i16vec4(unpack16(buf[i].r),unpack16(buf[i].g))" : "ivec4(unpackInt2x16(buf[i].r),unpackInt2x16(buf[i].g))");
+        custom_defines.append("i16buffer_st4(buf,i,v)", "{ivec4 _v=ivec4(v);buf[i]=ivec2(packInt2x16(ivec2(_v.r,_v.g)),packInt2x16(ivec2(_v.b,_v.a)));}");
+    }
+    else
+    {
+        custom_defines.append("i16buffer_st2(buf,i,v)", "{ivec2 _v=ivec2(v);buf[i]=int(_v.r);buf[(i)+1]=int(_v.g);}");
+        custom_defines.append("i16buffer_sm4(buf,i)", "buf[i]");
+        custom_defines.append("i16buffer_ld4(buf,i)", "ivec4(buf[i])");
+        custom_defines.append("i16buffer_st4(buf,i,v)", "{buf[i]=ivec4(v);}");
+    }
 
     custom_defines.append("psc(x)", "(x==0?p.x:x)");
 
@@ -6103,7 +6219,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
     {
         custom_exts += "#extension GL_EXT_bfloat16: require\n";
     }
-    if (opt.use_fp16_storage || opt.use_bf16_storage)
+    if (opt.use_fp16_storage || opt.use_bf16_storage || opt.use_int16_storage)
     {
         custom_exts += "#extension GL_EXT_shader_16bit_storage: require\n";
     }
diff --git a/src/gpu.h b/src/gpu.h
index aec8b033fe0a..e313a1a5a2b9 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -278,7 +278,7 @@ class NCNN_EXPORT GpuInfo
     // but sometimes bug is a feature
     bool bug_implicit_fp16_arithmetic() const;
 
-    // fp16 and int8 feature
+    // fp16/int8/int16 feature
     bool support_fp16_packed() const;
     bool support_fp16_storage() const;
     bool support_fp16_uniform() const;
@@ -287,6 +287,10 @@ class NCNN_EXPORT GpuInfo
     bool support_int8_storage() const;
     bool support_int8_uniform() const;
     bool support_int8_arithmetic() const;
+    bool support_int16_packed() const;
+    // storage only; pair with support_int16_arithmetic() for shader int16 type
+    bool support_int16_storage() const;
+    bool support_int16_arithmetic() const;
 
     // bf16 feature
     bool support_bf16_packed() const;
@@ -309,6 +313,9 @@ class NCNN_EXPORT GpuInfo
     bool support_cooperative_matrix_16_8_16() const;
     bool support_cooperative_matrix_16_16_16() const;
 
+    // int8 cooperative matrix feature
+    bool support_int8_cooperative_matrix() const;
+
     // bf16 cooperative matrix feature
     bool support_bf16_cooperative_matrix() const;
 
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index 04566e616edb..e82c25cedaf8 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -326,6 +326,13 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
         return forward_int8_arm(bottom_blob, top_blob, opt);
     }
 #endif
@@ -333,6 +340,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index 8533ee0b0c0e..991180aa93f3 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -188,6 +188,13 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
 #if NCNN_INT8
     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
         return forward_int8(bottom_blob, top_blob, opt);
     }
 #endif
@@ -195,6 +202,9 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         int num_input = weight_data_size / num_output;
         if (bottom_blob.w * bottom_blob.elempack == num_input)
         {
@@ -373,6 +383,13 @@ static inline signed char float2int8(float v)
 
 int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
+    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+    {
+        NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+        return -1;
+    }
+
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp
index 612ff8122edb..0f024d3c2531 100644
--- a/src/layer/loongarch/convolution_loongarch.cpp
+++ b/src/layer/loongarch/convolution_loongarch.cpp
@@ -444,6 +444,13 @@ int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
 #if NCNN_BF16
         if (opt.use_bf16_storage && bottom_blob.elembits() == 16)
         {
@@ -469,6 +476,9 @@ int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
@@ -1085,6 +1095,9 @@ int Convolution_loongarch::forward_bf16s(const Mat& bottom_blob, Mat& top_blob,
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp
index 9655682afad3..f203c4345888 100644
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -382,6 +382,13 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
 #if NCNN_BF16
         if (opt.use_bf16_storage && bottom_blob.elembits() == 16)
         {
@@ -407,6 +414,9 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
@@ -1019,6 +1029,9 @@ int Convolution_mips::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index a45e8bc6223a..c51d673883fa 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -201,6 +201,13 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
         Mat bottom_blob_unpacked = bottom_blob;
         if (bottom_blob.elempack != 1)
         {
@@ -232,6 +239,9 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index cd35b9b451bb..aab3d049e03e 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -5,6 +5,7 @@
 
 #include "layer_shader_type.h"
 #include "layer_type.h"
+#include "modelbin.h"
 
 namespace ncnn {
 
@@ -17,7 +18,6 @@ Convolution_vulkan::Convolution_vulkan()
 
     pipeline_convolution = 0;
     pipeline_convolution_1x1s1d1 = 0;
-
     pipeline_convolution_gemm = 0;
 
     pipeline_convolution_3x3s1d1_winograd23_transform_input = 0;
@@ -32,6 +32,11 @@ Convolution_vulkan::Convolution_vulkan()
     reshape_w = 0;
 
     use_cooperative_matrix = false;
+#if NCNN_INT8
+    quantize = 0;
+    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = 0;
+    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = 0;
+#endif
     coopmat_M = 0;
     coopmat_N = 0;
     coopmat_K = 0;
@@ -52,11 +57,26 @@ int Convolution_vulkan::load_param(const ParamDict& pd)
         support_vulkan = false;
     }
 
+#if NCNN_INT8
+    if (int8_scale_term && pad_value != 0.f)
+    {
+        NCNN_LOGE("Convolution_vulkan int8 nonzero pad value is not supported");
+        support_vulkan = false;
+    }
+#endif
+
     return ret;
 }
 
 int Convolution_vulkan::create_pipeline(const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return create_pipeline_int8(opt);
+    }
+#endif
+
     Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     Mat out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
@@ -1497,6 +1517,13 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt)
     pipeline_convolution_3x3s1d1_winograd43_gemm = 0;
     pipeline_convolution_3x3s1d1_winograd43_transform_output = 0;
 
+#if NCNN_INT8
+    delete pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm;
+    delete pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm;
+    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = 0;
+    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = 0;
+#endif
+
     // fc
     if (reshape_1x1xw)
     {
@@ -1513,6 +1540,14 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt)
     }
 
     use_cooperative_matrix = false;
+#if NCNN_INT8
+    if (quantize)
+    {
+        quantize->destroy_pipeline(opt);
+        delete quantize;
+        quantize = 0;
+    }
+#endif
     coopmat_M = 0;
     coopmat_N = 0;
     coopmat_K = 0;
@@ -1528,6 +1563,13 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt)
 
 int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return upload_model_int8(cmd, opt);
+    }
+#endif
+
     if (padding)
     {
         padding->upload_model(cmd, opt);
@@ -1575,6 +1617,13 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 
 int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blob, top_blob, cmd, opt);
+    }
+#endif
+
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
@@ -1584,6 +1633,9 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         int num_input = weight_data_size / num_output;
         if (bottom_blob.w * bottom_blob.elempack == num_input)
         {
@@ -2101,4 +2153,1837 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
     return 0;
 }
 
+#if NCNN_INT8
+int Convolution_vulkan::create_pipeline_int8(const Option& opt)
+{
+    Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    Mat out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    // skip fc like hint
+    if (shape.dims != 3) shape = Mat();
+    if (out_shape.dims != 3) out_shape = Mat();
+
+    if (weight_data.elemsize != (size_t)1u)
+    {
+        NCNN_LOGE("Convolution_vulkan int8 weight data is not int8");
+        return -1;
+    }
+
+    Option opt_int8 = opt;
+    opt_int8.use_fp16_arithmetic = false;
+    opt_int8.use_int16_packed = false;
+    opt_int8.use_int16_storage = false;
+    const bool use_int8_requantize = int8_scale_term > 100;
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+    bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
+    bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
+    bool use_winograd = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16;
+    bool use_winograd43 = use_winograd && opt.use_winograd43_convolution;
+    bool use_winograd23 = use_winograd && opt.use_winograd23_convolution;
+    bool use_gemm = opt.use_sgemm_convolution && !is_conv1x1s1d1 && !use_winograd && num_input * maxk >= 8 && num_output >= 8;
+    const int elempack = opt.use_packing_layout && num_input % 4 == 0 ? 4 : 1;
+    const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
+
+    Mat shape_int8;
+    if (shape.dims == 3)
+    {
+        shape_int8 = Mat(shape.w, shape.h, num_input / elempack, (void*)0, (size_t)elempack, elempack);
+    }
+
+    Mat shape_int8_bordered;
+    if (shape_int8.dims == 3)
+    {
+        if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
+        {
+            shape_int8_bordered = Mat(shape_int8.w + pad_left + pad_right, shape_int8.h + pad_top + pad_bottom, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack);
+        }
+        else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
+                 || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
+        {
+            const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+            const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+            int wpad = kernel_extent_w + (shape_int8.w - 1) / stride_w * stride_w - shape_int8.w;
+            int hpad = kernel_extent_h + (shape_int8.h - 1) / stride_h * stride_h - shape_int8.h;
+            if (wpad > 0 || hpad > 0)
+            {
+                shape_int8_bordered = Mat(shape_int8.w + wpad, shape_int8.h + hpad, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack);
+            }
+            else
+            {
+                shape_int8_bordered = shape_int8;
+            }
+        }
+        else
+        {
+            shape_int8_bordered = shape_int8;
+        }
+    }
+
+    if (shape_int8_bordered.dims == 3 && use_winograd43 && use_winograd23)
+    {
+        const int w_bordered = shape_int8_bordered.w;
+        const int h_bordered = shape_int8_bordered.h;
+
+        bool prefer_winograd43 = true;
+        if (vkdev->info.type() == 0 && ((w_bordered <= 18 && h_bordered <= 18) || ((w_bordered >= 23 && w_bordered <= 24) && (h_bordered >= 23 && h_bordered <= 24))))
+            prefer_winograd43 = false;
+        if (vkdev->info.type() != 0 && (w_bordered <= 12 && h_bordered <= 12))
+            prefer_winograd43 = false;
+
+        use_winograd43 = prefer_winograd43;
+        use_winograd23 = !prefer_winograd43;
+    }
+
+    Mat shape_padding_int8_bordered;
+    if (shape_int8_bordered.dims == 3)
+    {
+        const int padding_outc = shape_int8_bordered.c * shape_int8_bordered.elempack;
+        const int padding_out_elempack = padding_outc % 4 == 0 ? 4 : 1;
+        const size_t padding_out_elemsize = shape_int8_bordered.elemsize / shape_int8_bordered.elempack * padding_out_elempack;
+        shape_padding_int8_bordered = Mat(shape_int8_bordered.w, shape_int8_bordered.h, padding_outc / padding_out_elempack, (void*)0, padding_out_elemsize, padding_out_elempack);
+    }
+
+    Mat out_shape_blob;
+    if (out_shape.dims == 3)
+    {
+        size_t out_elemsize;
+        if (use_int8_requantize)
+        {
+            out_elemsize = out_elempack;
+        }
+        else if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+        {
+            out_elemsize = (size_t)2u * out_elempack;
+        }
+        else
+        {
+            out_elemsize = (size_t)4u * out_elempack;
+        }
+
+        out_shape_blob = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack);
+    }
+
+    {
+        quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize);
+        quantize->vkdev = vkdev;
+
+        Mat shape_quantize;
+        Mat out_shape_quantize;
+        if (shape.dims == 3)
+        {
+            size_t shape_elemsize = shape.elemsize;
+            if (shape.elempack != 0)
+                shape_elemsize = shape.elemsize / shape.elempack * elempack;
+
+            shape_quantize = Mat(shape.w, shape.h, num_input / elempack, (void*)0, shape_elemsize, elempack);
+            out_shape_quantize = shape_int8;
+        }
+
+        quantize->bottom_shapes.resize(1);
+        quantize->bottom_shapes[0] = shape_quantize;
+        quantize->top_shapes.resize(1);
+        quantize->top_shapes[0] = out_shape_quantize;
+
+        ncnn::ParamDict pd;
+        pd.set(0, 1);
+        quantize->load_param(pd);
+
+        Mat weights[1];
+        weights[0] = bottom_blob_int8_scales;
+        quantize->load_model(ModelBinFromMatArray(weights));
+
+        Option opt_quantize = opt;
+        opt_quantize.use_fp16_arithmetic = false;
+
+        quantize->create_pipeline(opt_quantize);
+    }
+
+    {
+        padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
+        padding->vkdev = vkdev;
+
+        padding->bottom_shapes.resize(1);
+        padding->bottom_shapes[0] = shape_int8;
+        padding->top_shapes.resize(1);
+        padding->top_shapes[0] = shape_padding_int8_bordered;
+
+        ncnn::ParamDict pd;
+        pd.set(0, pad_top);
+        pd.set(1, pad_bottom);
+        pd.set(2, pad_left);
+        pd.set(3, pad_right);
+        pd.set(4, 0);
+        pd.set(5, 0.f);
+
+        padding->load_param(pd);
+
+        padding->create_pipeline(opt);
+    }
+
+    const int num_input_packed = (num_input + 3) / 4 * 4;
+    const int num_output_packed = (num_output + 3) / 4 * 4;
+    const int c_packed = num_input_packed / 4;
+    const int outc_pack4 = num_output_packed / 4;
+    const int c_shader = use_gemm || elempack == 4 ? c_packed : num_input;
+
+    std::vector<vk_specialization_type> specializations(13 + 10);
+    specializations[0].i = kernel_w;
+    specializations[1].i = kernel_h;
+    specializations[2].i = dilation_w;
+    specializations[3].i = dilation_h;
+    specializations[4].i = stride_w;
+    specializations[5].i = stride_h;
+    specializations[6].i = bias_term;
+    specializations[7].i = activation_type;
+    specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+    specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+    specializations[10].i = use_int8_requantize ? 1 : 0;
+    specializations[11].i = elempack;
+    specializations[12].i = out_elempack;
+    specializations[13 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0;
+    specializations[13 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0;
+    specializations[13 + 2].i = shape_int8_bordered.dims != 0 ? c_shader : 0;
+    specializations[13 + 3].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0;
+    specializations[13 + 4].i = out_shape_blob.dims != 0 ? out_shape_blob.w : 0;
+    specializations[13 + 5].i = out_shape_blob.dims != 0 ? out_shape_blob.h : 0;
+    specializations[13 + 6].i = out_shape_blob.dims != 0 ? outc_pack4 : 0;
+    specializations[13 + 7].i = out_shape_blob.dims != 0 ? (out_elempack == 4 ? out_shape_blob.cstep : out_shape_blob.cstep * 4) : 0;
+    specializations[13 + 8].i = num_output;
+    specializations[13 + 9].i = num_input;
+
+    const bool use_int8_winograd_int16_storage = use_winograd && opt.use_int16_storage && vkdev->info.support_int16_storage() && vkdev->info.support_int16_arithmetic();
+    if (use_winograd)
+    {
+        if (use_int8_winograd_int16_storage)
+            opt_int8.use_int16_storage = true;
+        else
+            opt_int8.use_int16_packed = true;
+    }
+
+    use_cooperative_matrix = false;
+    coopmat_M = 0;
+    coopmat_N = 0;
+    coopmat_K = 0;
+    coopmat_subgroup_size = 0;
+
+    if (use_winograd && opt.use_cooperative_matrix && vkdev->info.support_int8_cooperative_matrix())
+    {
+        int M = 1024;
+        if (out_shape.dims == 3)
+        {
+            const int block_x = use_winograd43 ? (out_shape.w + 3) / 4 : (out_shape.w + 1) / 2;
+            const int block_y = use_winograd43 ? (out_shape.h + 3) / 4 : (out_shape.h + 1) / 2;
+            M = block_x * block_y;
+        }
+
+        const int N = num_output;
+        const int K = num_input;
+
+        coopmat_subgroup_size = vkdev->info.querySubgroupProperties().subgroupSize;
+
+        double min_cost = 1e300;
+
+        if (vkdev->info.support_VK_KHR_cooperative_matrix() && vkdev->info.queryCooperativeMatrixFeatures().cooperativeMatrix)
+        {
+            const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = vkdev->info.queryCooperativeMatrixSubProperties();
+            for (size_t i = 0; i < properties.size(); i++)
+            {
+                const VkCooperativeMatrixPropertiesKHR& cmp = properties[i];
+                if (cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR && cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR
+                        && cmp.CType == VK_COMPONENT_TYPE_SINT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_SINT32_KHR
+                        && cmp.scope == VK_SCOPE_SUBGROUP_KHR
+                        && cmp.MSize % 4 == 0 && cmp.NSize % 4 == 0 && cmp.KSize % 4 == 0)
+                {
+                    const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize;
+                    const int N_pad = (N + cmp.NSize - 1) / cmp.NSize * cmp.NSize;
+                    const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize;
+
+                    double cost = (double)M_pad * N_pad * K_pad - (double)M * N * K;
+                    if (cost < min_cost)
+                    {
+                        min_cost = cost;
+                        coopmat_M = cmp.MSize;
+                        coopmat_N = cmp.NSize;
+                        coopmat_K = cmp.KSize;
+                    }
+                }
+            }
+        }
+        else if (vkdev->info.support_VK_NV_cooperative_matrix() && vkdev->info.queryCooperativeMatrixFeaturesNV().cooperativeMatrix)
+        {
+            const std::vector<VkCooperativeMatrixPropertiesNV>& properties = vkdev->info.queryCooperativeMatrixSubPropertiesNV();
+            for (size_t i = 0; i < properties.size(); i++)
+            {
+                const VkCooperativeMatrixPropertiesNV& cmp = properties[i];
+                if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV
+                        && cmp.CType == VK_COMPONENT_TYPE_SINT32_NV && cmp.DType == VK_COMPONENT_TYPE_SINT32_NV
+                        && cmp.scope == VK_SCOPE_SUBGROUP_NV
+                        && cmp.MSize % 4 == 0 && cmp.NSize % 4 == 0 && cmp.KSize % 4 == 0)
+                {
+                    const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize;
+                    const int N_pad = (N + cmp.NSize - 1) / cmp.NSize * cmp.NSize;
+                    const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize;
+
+                    double cost = (double)M_pad * N_pad * K_pad - (double)M * N * K;
+                    if (cost < min_cost)
+                    {
+                        min_cost = cost;
+                        coopmat_M = cmp.MSize;
+                        coopmat_N = cmp.NSize;
+                        coopmat_K = cmp.KSize;
+                    }
+                }
+            }
+        }
+
+        if (coopmat_M != 0 && coopmat_N != 0 && coopmat_K != 0)
+        {
+            use_cooperative_matrix = true;
+
+            UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2);
+            UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2);
+            UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2);
+
+            UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2);
+            UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2);
+        }
+    }
+    else if ((is_conv1x1s1d1 || use_gemm) && opt.use_cooperative_matrix && opt.use_int8_arithmetic && vkdev->info.support_int8_cooperative_matrix())
+    {
+        const int M = out_shape.dims == 3 ? out_shape.w * out_shape.h : 1024;
+        const int N = num_output;
+        const int K = is_conv1x1s1d1 ? num_input : num_input * maxk;
+
+        if (N >= 8 && K >= 8)
+        {
+            vkdev->info.get_optimal_cooperative_matrix_mnk(M, N, K, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR, VK_SCOPE_SUBGROUP_KHR, coopmat_M, coopmat_N, coopmat_K, coopmat_subgroup_size);
+        }
+
+        if (coopmat_M != 0 && coopmat_N != 0 && coopmat_K != 0)
+        {
+            use_cooperative_matrix = true;
+
+            UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2);
+            UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2);
+            UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2);
+
+            UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2);
+            UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2);
+        }
+    }
+
+    if (is_conv1x1s1d1)
+    {
+        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+        if (use_cooperative_matrix)
+        {
+            const signed char* weight_data_ptr = weight_data;
+
+            const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+            const int kk = (num_input + coopmat_K - 1) / coopmat_K;
+            const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+            const int coopmat_Nd4 = coopmat_N / 4;
+            const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0);
+
+            const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded;
+            weight_data_int8_packed.create(weight_data_int8_packed_size, blocks_n, (size_t)4u, 4);
+            if (weight_data_int8_packed.empty())
+                return -100;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int bn = 0; bn < blocks_n; bn++)
+            {
+                signed char* p = weight_data_int8_packed.row<signed char>(bn);
+
+                for (int k = 0; k < kk_padded; k += UNROLL_SG_K)
+                {
+                    for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                    {
+                        for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                        {
+                            for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                            {
+                                for (int i = 0; i < coopmat_K; i++)
+                                {
+                                    for (int j = 0; j < coopmat_Nd4p; j++)
+                                    {
+                                        for (int jj = 0; jj < 4; jj++)
+                                        {
+                                            const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj;
+                                            const int gki = (k + zk) * coopmat_K + i;
+
+                                            *p++ = j < coopmat_Nd4 && gni < num_output && gki < num_input ? weight_data_ptr[gni * num_input + gki] : 0;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8;
+            const int num_output_packed = num_output_pack4_aligned * 4;
+
+            weight_data_int8_packed.create(maxk, num_input_packed / 4, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4);
+
+            for (int q = 0; q < num_output_packed; q += 4)
+            {
+                signed char* g00 = weight_data_int8_packed.channel(q / 4);
+
+                for (int p = 0; p < num_input_packed; p += 4)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        for (int i = 0; i < 4; i++)
+                        {
+                            for (int j = 0; j < 4; j++)
+                            {
+                                if (q + i < num_output && p + j < num_input)
+                                {
+                                    const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);
+                                    g00[0] = k00[k];
+                                }
+                                else
+                                {
+                                    g00[0] = 0;
+                                }
+                                g00++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        if (use_gemm)
+        {
+            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+            if (use_cooperative_matrix)
+            {
+                const signed char* weight_data_ptr = weight_data;
+
+                const int K = num_input * maxk;
+                const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+                const int kk = (K + coopmat_K - 1) / coopmat_K;
+                const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+                const int coopmat_Nd4 = coopmat_N / 4;
+                const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0);
+
+                const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded;
+                weight_data_int8_packed.create(weight_data_int8_packed_size, blocks_n, (size_t)4u, 4);
+                if (weight_data_int8_packed.empty())
+                    return -100;
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int bn = 0; bn < blocks_n; bn++)
+                {
+                    signed char* p = weight_data_int8_packed.row<signed char>(bn);
+
+                    for (int k = 0; k < kk_padded; k += UNROLL_SG_K)
+                    {
+                        for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                        {
+                            for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                            {
+                                for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                                {
+                                    for (int i = 0; i < coopmat_K; i++)
+                                    {
+                                        for (int j = 0; j < coopmat_Nd4p; j++)
+                                        {
+                                            for (int jj = 0; jj < 4; jj++)
+                                            {
+                                                const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj;
+                                                const int gki = (k + zk) * coopmat_K + i;
+
+                                                *p++ = j < coopmat_Nd4 && gni < num_output && gki < K ? weight_data_ptr[gni * K + gki] : 0;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8;
+
+                weight_data_int8_packed.create(maxk, c_packed, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < num_output_pack4_aligned; q++)
+                {
+                    signed char* g00 = weight_data_int8_packed.channel(q);
+
+                    for (int p = 0; p < num_input_packed; p += 4)
+                    {
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            for (int i = 0; i < 4; i++)
+                            {
+                                for (int j = 0; j < 4; j++)
+                                {
+                                    if (q * 4 + i < num_output && p + j < num_input)
+                                    {
+                                        const signed char* k00 = weight_data_r2.channel(q * 4 + i).row<const signed char>(p + j);
+                                        g00[0] = k00[k];
+                                    }
+                                    else
+                                    {
+                                        g00[0] = 0;
+                                    }
+                                    g00++;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else if (!use_winograd)
+        {
+            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+            const int num_output_pack4_aligned = ((num_output + 3) / 4 + 7) / 8 * 8;
+
+            if (elempack == 4)
+            {
+                weight_data_int8_packed.create(maxk, c_packed, num_output_pack4_aligned, (size_t)4 * 4, 4 * 4);
+            }
+            else
+            {
+                weight_data_int8_packed.create(maxk * num_input, num_output_pack4_aligned, (size_t)4u, 4);
+            }
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < num_output_pack4_aligned; q++)
+            {
+                signed char* g00 = elempack == 4 ? weight_data_int8_packed.channel(q) : weight_data_int8_packed.row<signed char>(q);
+
+                const int num_input_loop = elempack == 4 ? num_input_packed : num_input;
+                for (int p = 0; p < num_input_loop; p += elempack)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        for (int i = 0; i < 4; i++)
+                        {
+                            for (int j = 0; j < elempack; j++)
+                            {
+                                if (q * 4 + i < num_output && p + j < num_input)
+                                {
+                                    const signed char* k00 = weight_data_r2.channel(q * 4 + i).row<const signed char>(p + j);
+                                    g00[0] = k00[k];
+                                }
+                                else
+                                {
+                                    g00[0] = 0;
+                                }
+                                g00++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            weight_data_int8_packed = weight_data.reshape(weight_data_size);
+        }
+    }
+
+    {
+        int num_output_packed = (num_output + 3) / 4 * 4;
+        if (!is_conv1x1s1d1 && !use_winograd && !use_gemm)
+            num_output_packed = (num_output + 7) / 8 * 8;
+
+        const float bottom_blob_int8_scale = bottom_blob_int8_scales.empty() ? 1.f : bottom_blob_int8_scales[0];
+        const float bottom_blob_int8_descale = bottom_blob_int8_scale == 0.f ? 0.f : 1.f / bottom_blob_int8_scale;
+
+        if (is_conv1x1s1d1)
+            weight_data_int8_descales.create(num_output_packed / 4, (size_t)4u * 4, 4);
+        else
+            weight_data_int8_descales.create(num_output_packed, (size_t)4u, 1);
+        if (weight_data_int8_descales.empty())
+            return -100;
+
+        float* outptr = weight_data_int8_descales;
+        for (int q = 0; q < num_output_packed; q += 4)
+        {
+            float scale0 = q + 0 < num_output ? weight_data_int8_scales[q + 0] : 0.f;
+            float scale1 = q + 1 < num_output ? weight_data_int8_scales[q + 1] : 0.f;
+            float scale2 = q + 2 < num_output ? weight_data_int8_scales[q + 2] : 0.f;
+            float scale3 = q + 3 < num_output ? weight_data_int8_scales[q + 3] : 0.f;
+            outptr[0] = scale0 == 0.f ? 0.f : bottom_blob_int8_descale / scale0;
+            outptr[1] = scale1 == 0.f ? 0.f : bottom_blob_int8_descale / scale1;
+            outptr[2] = scale2 == 0.f ? 0.f : bottom_blob_int8_descale / scale2;
+            outptr[3] = scale3 == 0.f ? 0.f : bottom_blob_int8_descale / scale3;
+            outptr += 4;
+        }
+    }
+
+    if (bias_term)
+    {
+        int num_output_packed = (num_output + 3) / 4 * 4;
+        if (!is_conv1x1s1d1 && !use_winograd && !use_gemm)
+            num_output_packed = (num_output + 7) / 8 * 8;
+
+        bias_data_int8_packed.create(num_output_packed, (size_t)4u, 1);
+        if (bias_data_int8_packed.empty())
+            return -100;
+
+        float* outptr = bias_data_int8_packed;
+        for (int q = 0; q < num_output_packed; q += 4)
+        {
+            outptr[0] = q + 0 < num_output ? bias_data[q + 0] : 0.f;
+            outptr[1] = q + 1 < num_output ? bias_data[q + 1] : 0.f;
+            outptr[2] = q + 2 < num_output ? bias_data[q + 2] : 0.f;
+            outptr[3] = q + 3 < num_output ? bias_data[q + 3] : 0.f;
+            outptr += 4;
+        }
+    }
+
+    if (use_winograd)
+    {
+        if (use_winograd43)
+        {
+            Mat weight_data_tm;
+            weight_data_tm.create(36, num_input, num_output);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p = 0; p < num_output; p++)
+            {
+                for (int q = 0; q < num_input; q++)
+                {
+                    const signed char* kernel0 = (const signed char*)weight_data + p * num_input * 9 + q * 9;
+                    int* kernel_tm0 = weight_data_tm.channel(p).row<int>(q);
+
+                    int tmp[6][3];
+                    for (int m = 0; m < 3; m++)
+                    {
+                        const int r0 = kernel0[0];
+                        const int r1 = kernel0[1];
+                        const int r2 = kernel0[2];
+
+                        tmp[0][m] = r0 * 6;
+                        tmp[1][m] = -r0 * 4 - r1 * 4 - r2 * 4;
+                        tmp[2][m] = -r0 * 4 + r1 * 4 - r2 * 4;
+                        tmp[3][m] = r0 + r1 * 2 + r2 * 4;
+                        tmp[4][m] = r0 - r1 * 2 + r2 * 4;
+                        tmp[5][m] = r2 * 6;
+
+                        kernel0 += 3;
+                    }
+
+                    for (int m = 0; m < 6; m++)
+                    {
+                        const int r0 = tmp[m][0];
+                        const int r1 = tmp[m][1];
+                        const int r2 = tmp[m][2];
+
+                        kernel_tm0[m * 6 + 0] = r0 * 6;
+                        kernel_tm0[m * 6 + 1] = -r0 * 4 - r1 * 4 - r2 * 4;
+                        kernel_tm0[m * 6 + 2] = -r0 * 4 + r1 * 4 - r2 * 4;
+                        kernel_tm0[m * 6 + 3] = r0 + r1 * 2 + r2 * 4;
+                        kernel_tm0[m * 6 + 4] = r0 - r1 * 2 + r2 * 4;
+                        kernel_tm0[m * 6 + 5] = r2 * 6;
+                    }
+                }
+            }
+
+            {
+                if (use_cooperative_matrix)
+                {
+                    const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+                    const int kk = (num_input + coopmat_K - 1) / coopmat_K;
+                    const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+                    const int coopmat_Nd4 = coopmat_N / 4;
+                    const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0);
+
+                    const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded;
+                    weight_winograd43_data_int8_packed_cm.create(weight_data_int8_packed_size, blocks_n, 36, (size_t)8u, 1);
+                    if (weight_winograd43_data_int8_packed_cm.empty())
+                        return -100;
+
+                    #pragma omp parallel for num_threads(opt.num_threads)
+                    for (int b = 0; b < 36; b++)
+                    {
+                        for (int bn = 0; bn < blocks_n; bn++)
+                        {
+                            signed char* p0 = weight_winograd43_data_int8_packed_cm.channel(b).row<signed char>(bn);
+
+                            for (int k = 0; k < kk_padded; k += UNROLL_SG_K)
+                            {
+                                for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                                {
+                                    for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                                    {
+                                        for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                                        {
+                                            for (int i = 0; i < coopmat_K; i++)
+                                            {
+                                                for (int j = 0; j < coopmat_Nd4p; j++)
+                                                {
+                                                    for (int jj = 0; jj < 4; jj++)
+                                                    {
+                                                        const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj;
+                                                        const int gki = (k + zk) * coopmat_K + i;
+
+                                                        const int v = j < coopmat_Nd4 && gni < num_output && gki < num_input ? weight_data_tm.channel(gni).row<const int>(gki)[b] : 0;
+                                                        int vlow = v & 255;
+                                                        if (vlow >= 128) vlow -= 256;
+                                                        p0[jj] = (signed char)vlow;
+                                                        p0[4 + jj] = (signed char)((v - vlow) >> 8);
+                                                    }
+                                                    p0 += 8;
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                {
+                    const int num_input_packed = (num_input + 3) / 4 * 4;
+                    const int num_output_packed = (num_output + 3) / 4 * 4;
+                    const int c4 = num_input_packed / 4;
+
+                    weight_winograd43_data_int8_packed.create(c4, num_output_packed, 36, (size_t)8u, 1);
+
+                    for (int k = 0; k < 36; k++)
+                    {
+                        int* g00 = weight_winograd43_data_int8_packed.channel(k);
+
+                        for (int p = 0; p < num_output_packed; p++)
+                        {
+                            const int* k0 = p < num_output ? weight_data_tm.channel(p) : 0;
+
+                            for (int q = 0; q < num_input_packed; q += 4)
+                            {
+                                const int v0 = k0 && q + 0 < num_input ? k0[(q + 0) * 36 + k] : 0;
+                                const int v1 = k0 && q + 1 < num_input ? k0[(q + 1) * 36 + k] : 0;
+                                const int v2 = k0 && q + 2 < num_input ? k0[(q + 2) * 36 + k] : 0;
+                                const int v3 = k0 && q + 3 < num_input ? k0[(q + 3) * 36 + k] : 0;
+
+                                g00[0] = (int)(((unsigned int)(unsigned short)v0) | ((unsigned int)(unsigned short)v1 << 16));
+                                g00[1] = (int)(((unsigned int)(unsigned short)v2) | ((unsigned int)(unsigned short)v3 << 16));
+                                g00 += 2;
+                            }
+                        }
+                    }
+                }
+            }
+
+            {
+                int block_x = 0;
+                int block_y = 0;
+                Mat shape_winograd_input_transformed;
+
+                if (shape_int8_bordered.dims == 3 && out_shape_blob.dims == 3)
+                {
+                    block_x = (out_shape_blob.w + 3) / 4;
+                    block_y = (out_shape_blob.h + 3) / 4;
+                    if (use_cooperative_matrix)
+                    {
+                        if (elempack == 4)
+                            shape_winograd_input_transformed = Mat(block_x * block_y * 2, 1, c_packed * 36, (void*)0, (size_t)4u, 4);
+                        else
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 36, (void*)0, (size_t)1u, 1);
+                    }
+                    else
+                    {
+                        if (elempack == 4)
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, c_packed * 36, (void*)0, (size_t)8u, 4);
+                        else
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 36, (void*)0, (size_t)2u, 1);
+                    }
+                }
+
+                std::vector<vk_specialization_type> specializations_winograd_input(1 + 6);
+                specializations_winograd_input[0].i = elempack == 4 ? c_packed : num_input;
+                specializations_winograd_input[1 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0;
+                specializations_winograd_input[1 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0;
+                specializations_winograd_input[1 + 2].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0;
+                specializations_winograd_input[1 + 3].i = shape_winograd_input_transformed.dims != 0 ? shape_winograd_input_transformed.cstep : 0;
+                specializations_winograd_input[1 + 4].i = block_x;
+                specializations_winograd_input[1 + 5].i = block_y;
+
+                int shader_type_index = -1;
+                if (use_cooperative_matrix)
+                {
+                    if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_input_int8_cm;
+                    if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm;
+                }
+                else
+                {
+                    if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_input_int8;
+                    if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_input_int8;
+                }
+
+                pipeline_convolution_3x3s1d1_winograd43_transform_input = new Pipeline(vkdev);
+                pipeline_convolution_3x3s1d1_winograd43_transform_input->set_local_size_xyz(4, 4, 1);
+                pipeline_convolution_3x3s1d1_winograd43_transform_input->create(shader_type_index, opt_int8, specializations_winograd_input);
+            }
+            {
+                // winograd23/43 share the gemm shader; the transform count (36 for winograd43) is passed below as specialization constant 0
+                pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev);
+                {
+                    std::vector<vk_specialization_type> specializations_winograd_gemm(5 + 3);
+                    specializations_winograd_gemm[0].i = 36;
+                    specializations_winograd_gemm[1].i = num_input;
+                    specializations_winograd_gemm[2].i = num_output;
+                    specializations_winograd_gemm[3].i = elempack;
+                    specializations_winograd_gemm[4].i = out_elempack;
+                    specializations_winograd_gemm[5 + 0].i = 0;
+                    specializations_winograd_gemm[5 + 1].i = 0;
+                    specializations_winograd_gemm[5 + 2].i = 0;
+
+                    pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(opt_int8.use_shader_local_memory ? 8 : 4, opt_int8.use_shader_local_memory ? 8 : std::min(4, (num_output + 3) / 4), opt_int8.use_shader_local_memory ? 1 : 4);
+                    pipeline_convolution_3x3s1d1_winograd43_gemm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8, opt_int8, specializations_winograd_gemm);
+                }
+
+                if (use_cooperative_matrix)
+                {
+                    std::vector<vk_specialization_type> specializations_winograd_gemm(15 + 3);
+                    specializations_winograd_gemm[0].u32 = 36;
+                    specializations_winograd_gemm[1].u32 = coopmat_M;
+                    specializations_winograd_gemm[2].u32 = coopmat_N;
+                    specializations_winograd_gemm[3].u32 = coopmat_K;
+                    specializations_winograd_gemm[4].u32 = UNROLL_SG_M;
+                    specializations_winograd_gemm[5].u32 = UNROLL_SG_N;
+                    specializations_winograd_gemm[6].u32 = UNROLL_SG_K;
+                    specializations_winograd_gemm[7].u32 = UNROLL_WG_M;
+                    specializations_winograd_gemm[8].u32 = UNROLL_WG_N;
+                    specializations_winograd_gemm[9].u32 = coopmat_subgroup_size;
+                    specializations_winograd_gemm[10].u32 = num_input;
+                    specializations_winograd_gemm[11].u32 = num_output;
+                    specializations_winograd_gemm[12].u32 = elempack;
+                    specializations_winograd_gemm[13].u32 = out_elempack;
+                    specializations_winograd_gemm[14].u32 = weight_winograd43_data_int8_packed_cm.cstep;
+                    specializations_winograd_gemm[15 + 0].u32 = 0;
+                    specializations_winograd_gemm[15 + 1].u32 = 0;
+                    specializations_winograd_gemm[15 + 2].u32 = 0;
+
+                    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm = new Pipeline(vkdev);
+                    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->set_subgroup_size(coopmat_subgroup_size);
+                    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1);
+                    pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8_cm, opt_int8, specializations_winograd_gemm);
+                }
+            }
+            {
+                std::vector<vk_specialization_type> specializations_winograd_output(5);
+                specializations_winograd_output[0].i = bias_term;
+                specializations_winograd_output[1].i = activation_type;
+                specializations_winograd_output[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+                specializations_winograd_output[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+                specializations_winograd_output[4].i = use_int8_requantize ? 1 : 0;
+
+                int shader_type_index = -1;
+                if (out_elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd43_transform_output_int8;
+                if (out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_transform_output_int8;
+
+                pipeline_convolution_3x3s1d1_winograd43_transform_output = new Pipeline(vkdev);
+                pipeline_convolution_3x3s1d1_winograd43_transform_output->set_local_size_xyz(4, 4, 1);
+                pipeline_convolution_3x3s1d1_winograd43_transform_output->create(shader_type_index, opt_int8, specializations_winograd_output);
+            }
+        }
+
+        if (use_winograd23)
+        {
+            Mat weight_data_tm;
+            weight_data_tm.create(16, num_input, num_output);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p = 0; p < num_output; p++)
+            {
+                for (int q = 0; q < num_input; q++)
+                {
+                    const signed char* kernel0 = (const signed char*)weight_data + p * num_input * 9 + q * 9;
+                    int* kernel_tm0 = weight_data_tm.channel(p).row<int>(q);
+
+                    int tmp[4][3];
+                    for (int m = 0; m < 3; m++)
+                    {
+                        const int r0 = kernel0[0];
+                        const int r1 = kernel0[1];
+                        const int r2 = kernel0[2];
+
+                        tmp[0][m] = r0 * 2;
+                        tmp[1][m] = r0 + r1 + r2;
+                        tmp[2][m] = r0 - r1 + r2;
+                        tmp[3][m] = r2 * 2;
+
+                        kernel0 += 3;
+                    }
+
+                    for (int m = 0; m < 4; m++)
+                    {
+                        const int r0 = tmp[m][0];
+                        const int r1 = tmp[m][1];
+                        const int r2 = tmp[m][2];
+
+                        kernel_tm0[m * 4 + 0] = r0 * 2;
+                        kernel_tm0[m * 4 + 1] = r0 + r1 + r2;
+                        kernel_tm0[m * 4 + 2] = r0 - r1 + r2;
+                        kernel_tm0[m * 4 + 3] = r2 * 2;
+                    }
+                }
+            }
+
+            {
+                if (use_cooperative_matrix)
+                {
+                    const int blocks_n = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+                    const int kk = (num_input + coopmat_K - 1) / coopmat_K;
+                    const int kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+                    const int coopmat_Nd4 = coopmat_N / 4;
+                    const int coopmat_Nd4p = coopmat_Nd4 + (vkdev->info.support_VK_KHR_cooperative_matrix() ? 1 : 0);
+
+                    const int weight_data_int8_packed_size = coopmat_Nd4p * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk_padded;
+                    weight_winograd23_data_int8_packed_cm.create(weight_data_int8_packed_size, blocks_n, 16, (size_t)8u, 1);
+                    if (weight_winograd23_data_int8_packed_cm.empty())
+                        return -100;
+
+                    #pragma omp parallel for num_threads(opt.num_threads)
+                    for (int b = 0; b < 16; b++)
+                    {
+                        for (int bn = 0; bn < blocks_n; bn++)
+                        {
+                            signed char* p0 = weight_winograd23_data_int8_packed_cm.channel(b).row<signed char>(bn);
+
+                            for (int k = 0; k < kk_padded; k += UNROLL_SG_K)
+                            {
+                                for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                                {
+                                    for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                                    {
+                                        for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                                        {
+                                            for (int i = 0; i < coopmat_K; i++)
+                                            {
+                                                for (int j = 0; j < coopmat_Nd4p; j++)
+                                                {
+                                                    for (int jj = 0; jj < 4; jj++)
+                                                    {
+                                                        const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j * 4 + jj;
+                                                        const int gki = (k + zk) * coopmat_K + i;
+
+                                                        const int v = j < coopmat_Nd4 && gni < num_output && gki < num_input ? weight_data_tm.channel(gni).row<const int>(gki)[b] : 0;
+                                                        int vlow = v & 255;
+                                                        if (vlow >= 128) vlow -= 256;
+                                                        p0[jj] = (signed char)vlow;
+                                                        p0[4 + jj] = (signed char)((v - vlow) >> 8);
+                                                    }
+                                                    p0 += 8;
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                {
+                    const int num_input_packed = (num_input + 3) / 4 * 4;
+                    const int num_output_packed = (num_output + 3) / 4 * 4;
+                    const int c4 = num_input_packed / 4;
+
+                    weight_winograd23_data_int8_packed.create(c4, num_output_packed, 16, (size_t)8u, 1);
+
+                    for (int k = 0; k < 16; k++)
+                    {
+                        int* g00 = weight_winograd23_data_int8_packed.channel(k);
+
+                        for (int p = 0; p < num_output_packed; p++)
+                        {
+                            const int* k0 = p < num_output ? weight_data_tm.channel(p) : 0;
+
+                            for (int q = 0; q < num_input_packed; q += 4)
+                            {
+                                const int v0 = k0 && q + 0 < num_input ? k0[(q + 0) * 16 + k] : 0;
+                                const int v1 = k0 && q + 1 < num_input ? k0[(q + 1) * 16 + k] : 0;
+                                const int v2 = k0 && q + 2 < num_input ? k0[(q + 2) * 16 + k] : 0;
+                                const int v3 = k0 && q + 3 < num_input ? k0[(q + 3) * 16 + k] : 0;
+
+                                g00[0] = (int)(((unsigned int)(unsigned short)v0) | ((unsigned int)(unsigned short)v1 << 16));
+                                g00[1] = (int)(((unsigned int)(unsigned short)v2) | ((unsigned int)(unsigned short)v3 << 16));
+                                g00 += 2;
+                            }
+                        }
+                    }
+                }
+            }
+
+            {
+                int block_x = 0;
+                int block_y = 0;
+                Mat shape_winograd_input_transformed;
+
+                if (shape_int8_bordered.dims == 3 && out_shape_blob.dims == 3)
+                {
+                    block_x = (out_shape_blob.w + 1) / 2;
+                    block_y = (out_shape_blob.h + 1) / 2;
+                    if (use_cooperative_matrix)
+                    {
+                        if (elempack == 4)
+                            shape_winograd_input_transformed = Mat(block_x * block_y * 2, 1, c_packed * 16, (void*)0, (size_t)4u, 4);
+                        else
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 16, (void*)0, (size_t)1u, 1);
+                    }
+                    else
+                    {
+                        if (elempack == 4)
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, c_packed * 16, (void*)0, (size_t)8u, 4);
+                        else
+                            shape_winograd_input_transformed = Mat(block_x * block_y, 1, num_input * 16, (void*)0, (size_t)2u, 1);
+                    }
+                }
+
+                std::vector<vk_specialization_type> specializations_winograd_input(1 + 6);
+                specializations_winograd_input[0].i = elempack == 4 ? c_packed : num_input;
+                specializations_winograd_input[1 + 0].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0;
+                specializations_winograd_input[1 + 1].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0;
+                specializations_winograd_input[1 + 2].i = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0;
+                specializations_winograd_input[1 + 3].i = shape_winograd_input_transformed.dims != 0 ? shape_winograd_input_transformed.cstep : 0;
+                specializations_winograd_input[1 + 4].i = block_x;
+                specializations_winograd_input[1 + 5].i = block_y;
+
+                int shader_type_index = -1;
+                if (use_cooperative_matrix)
+                {
+                    if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_input_int8_cm;
+                    if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm;
+                }
+                else
+                {
+                    if (elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_input_int8;
+                    if (elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_input_int8;
+                }
+
+                pipeline_convolution_3x3s1d1_winograd23_transform_input = new Pipeline(vkdev);
+                pipeline_convolution_3x3s1d1_winograd23_transform_input->set_local_size_xyz(8, 8, 1);
+                pipeline_convolution_3x3s1d1_winograd23_transform_input->create(shader_type_index, opt_int8, specializations_winograd_input);
+            }
+            {
+                // winograd23/43 share gemm shader, transform count is set by dispatcher.c
+                pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev);
+                {
+                    std::vector<vk_specialization_type> specializations_winograd_gemm(5 + 3);
+                    specializations_winograd_gemm[0].i = 16;
+                    specializations_winograd_gemm[1].i = num_input;
+                    specializations_winograd_gemm[2].i = num_output;
+                    specializations_winograd_gemm[3].i = elempack;
+                    specializations_winograd_gemm[4].i = out_elempack;
+                    specializations_winograd_gemm[5 + 0].i = 0;
+                    specializations_winograd_gemm[5 + 1].i = 0;
+                    specializations_winograd_gemm[5 + 2].i = 0;
+
+                    pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(opt_int8.use_shader_local_memory ? 8 : 4, opt_int8.use_shader_local_memory ? 8 : std::min(4, (num_output + 3) / 4), opt_int8.use_shader_local_memory ? 1 : 4);
+                    pipeline_convolution_3x3s1d1_winograd23_gemm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8, opt_int8, specializations_winograd_gemm);
+                }
+
+                if (use_cooperative_matrix)
+                {
+                    std::vector<vk_specialization_type> specializations_winograd_gemm(15 + 3);
+                    specializations_winograd_gemm[0].u32 = 16;
+                    specializations_winograd_gemm[1].u32 = coopmat_M;
+                    specializations_winograd_gemm[2].u32 = coopmat_N;
+                    specializations_winograd_gemm[3].u32 = coopmat_K;
+                    specializations_winograd_gemm[4].u32 = UNROLL_SG_M;
+                    specializations_winograd_gemm[5].u32 = UNROLL_SG_N;
+                    specializations_winograd_gemm[6].u32 = UNROLL_SG_K;
+                    specializations_winograd_gemm[7].u32 = UNROLL_WG_M;
+                    specializations_winograd_gemm[8].u32 = UNROLL_WG_N;
+                    specializations_winograd_gemm[9].u32 = coopmat_subgroup_size;
+                    specializations_winograd_gemm[10].u32 = num_input;
+                    specializations_winograd_gemm[11].u32 = num_output;
+                    specializations_winograd_gemm[12].u32 = elempack;
+                    specializations_winograd_gemm[13].u32 = out_elempack;
+                    specializations_winograd_gemm[14].u32 = weight_winograd23_data_int8_packed_cm.cstep;
+                    specializations_winograd_gemm[15 + 0].u32 = 0;
+                    specializations_winograd_gemm[15 + 1].u32 = 0;
+                    specializations_winograd_gemm[15 + 2].u32 = 0;
+
+                    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm = new Pipeline(vkdev);
+                    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->set_subgroup_size(coopmat_subgroup_size);
+                    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1);
+                    pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm->create(LayerShaderType::convolution_3x3s1d1_winograd_gemm_int8_cm, opt_int8, specializations_winograd_gemm);
+                }
+            }
+            {
+                std::vector<vk_specialization_type> specializations_winograd_output(5);
+                specializations_winograd_output[0].i = bias_term;
+                specializations_winograd_output[1].i = activation_type;
+                specializations_winograd_output[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+                specializations_winograd_output[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+                specializations_winograd_output[4].i = use_int8_requantize ? 1 : 0;
+
+                int shader_type_index = -1;
+                if (out_elempack == 1) shader_type_index = LayerShaderType::convolution_3x3s1d1_winograd23_transform_output_int8;
+                if (out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_transform_output_int8;
+
+                pipeline_convolution_3x3s1d1_winograd23_transform_output = new Pipeline(vkdev);
+                pipeline_convolution_3x3s1d1_winograd23_transform_output->set_local_size_xyz(8, 8, 1);
+                pipeline_convolution_3x3s1d1_winograd23_transform_output->create(shader_type_index, opt_int8, specializations_winograd_output);
+            }
+        }
+    }
+    else if (is_conv1x1s1d1)
+    {
+        if (use_cooperative_matrix)
+        {
+            std::vector<vk_specialization_type> specializations_1x1(7 + 5 + 9);
+            specializations_1x1[0].i = bias_term;
+            specializations_1x1[1].i = activation_type;
+            specializations_1x1[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+            specializations_1x1[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+            specializations_1x1[4].i = use_int8_requantize ? 1 : 0;
+            specializations_1x1[5].u32 = elempack;
+            specializations_1x1[6].u32 = out_elempack;
+            specializations_1x1[7 + 0].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0;
+            specializations_1x1[7 + 1].u32 = out_shape_blob.dims != 0 ? out_shape_blob.cstep : 0;
+            specializations_1x1[7 + 2].u32 = out_shape.dims != 0 ? out_shape.w * out_shape.h : 0;
+            specializations_1x1[7 + 3].u32 = num_output;
+            specializations_1x1[7 + 4].u32 = num_input;
+            specializations_1x1[12 + 0].u32 = coopmat_M;
+            specializations_1x1[12 + 1].u32 = coopmat_N;
+            specializations_1x1[12 + 2].u32 = coopmat_K;
+            specializations_1x1[12 + 3].u32 = coopmat_subgroup_size;
+            specializations_1x1[12 + 4].u32 = UNROLL_SG_M;
+            specializations_1x1[12 + 5].u32 = UNROLL_SG_N;
+            specializations_1x1[12 + 6].u32 = UNROLL_SG_K;
+            specializations_1x1[12 + 7].u32 = UNROLL_WG_M;
+            specializations_1x1[12 + 8].u32 = UNROLL_WG_N;
+
+            pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
+            pipeline_convolution_1x1s1d1->set_subgroup_size(coopmat_subgroup_size);
+            pipeline_convolution_1x1s1d1->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1);
+            pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1_int8_cm, opt_int8, specializations_1x1);
+        }
+        else
+        {
+            const int num_input_packed = (num_input + 3) / 4 * 4;
+            const int num_output_packed = (num_output + 3) / 4 * 4;
+
+            const int c_packed = num_input_packed / 4;
+            const int cstep_vec4 = shape_int8_bordered.dims != 0 ? (elempack == 4 ? shape_int8_bordered.cstep : shape_int8_bordered.cstep / 4) : 0;
+            const int outc_pack4 = num_output_packed / 4;
+
+            std::vector<vk_specialization_type> specializations_1x1(7 + 8);
+            specializations_1x1[0].i = bias_term;
+            specializations_1x1[1].i = activation_type;
+            specializations_1x1[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+            specializations_1x1[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+            specializations_1x1[4].i = use_int8_requantize ? 1 : 0;
+            specializations_1x1[5].i = elempack;
+            specializations_1x1[6].i = out_elempack;
+            specializations_1x1[7 + 0].i = c_packed;
+            specializations_1x1[7 + 1].i = cstep_vec4;
+            specializations_1x1[7 + 2].i = out_shape.dims != 0 ? outc_pack4 : 0;
+            specializations_1x1[7 + 3].i = out_shape_blob.dims != 0 ? (out_elempack == 4 ? out_shape_blob.cstep : out_shape_blob.cstep / 4) : 0;
+            specializations_1x1[7 + 4].i = out_shape_blob.dims != 0 ? out_shape_blob.cstep / 4 : 0;
+            specializations_1x1[7 + 5].i = out_shape.dims != 0 ? (out_shape.w * out_shape.h + 3) / 4 : 0;
+            specializations_1x1[7 + 6].i = num_output;
+            specializations_1x1[7 + 7].i = num_input;
+
+            Mat local_size_xyz(8, std::min(8, outc_pack4), 1, (void*)0);
+
+            pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
+            if (opt_int8.use_shader_local_memory)
+            {
+                pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1);
+            }
+            else
+            {
+                pipeline_convolution_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz);
+            }
+            pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_packed_1x1s1d1_int8, opt_int8, specializations_1x1);
+        }
+    }
+    else if (use_gemm)
+    {
+        if (use_cooperative_matrix)
+        {
+            std::vector<vk_specialization_type> specializations_gemm(13 + 8 + 9);
+            specializations_gemm[0].u32 = kernel_w;
+            specializations_gemm[1].u32 = kernel_h;
+            specializations_gemm[2].u32 = dilation_w;
+            specializations_gemm[3].u32 = dilation_h;
+            specializations_gemm[4].u32 = stride_w;
+            specializations_gemm[5].u32 = stride_h;
+            specializations_gemm[6].i = bias_term;
+            specializations_gemm[7].i = activation_type;
+            specializations_gemm[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+            specializations_gemm[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+            specializations_gemm[10].i = use_int8_requantize ? 1 : 0;
+            specializations_gemm[11].u32 = elempack;
+            specializations_gemm[12].u32 = out_elempack;
+            specializations_gemm[13 + 0].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.w : 0;
+            specializations_gemm[13 + 1].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.h : 0;
+            specializations_gemm[13 + 2].u32 = shape_int8_bordered.dims != 0 ? shape_int8_bordered.cstep : 0;
+            specializations_gemm[13 + 3].u32 = out_shape_blob.dims != 0 ? out_shape_blob.w : 0;
+            specializations_gemm[13 + 4].u32 = out_shape_blob.dims != 0 ? out_shape_blob.h : 0;
+            specializations_gemm[13 + 5].u32 = out_shape_blob.dims != 0 ? out_shape_blob.cstep : 0;
+            specializations_gemm[13 + 6].u32 = num_output;
+            specializations_gemm[13 + 7].u32 = num_input;
+            specializations_gemm[21 + 0].u32 = coopmat_M;
+            specializations_gemm[21 + 1].u32 = coopmat_N;
+            specializations_gemm[21 + 2].u32 = coopmat_K;
+            specializations_gemm[21 + 3].u32 = coopmat_subgroup_size;
+            specializations_gemm[21 + 4].u32 = UNROLL_SG_M;
+            specializations_gemm[21 + 5].u32 = UNROLL_SG_N;
+            specializations_gemm[21 + 6].u32 = UNROLL_SG_K;
+            specializations_gemm[21 + 7].u32 = UNROLL_WG_M;
+            specializations_gemm[21 + 8].u32 = UNROLL_WG_N;
+
+            pipeline_convolution_gemm = new Pipeline(vkdev);
+            pipeline_convolution_gemm->set_subgroup_size(coopmat_subgroup_size);
+            pipeline_convolution_gemm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1);
+            pipeline_convolution_gemm->create(LayerShaderType::convolution_gemm_int8_cm, opt_int8, specializations_gemm);
+        }
+        else
+        {
+            const int outc_pack4 = (num_output + 3) / 4;
+            const int outsize = shape.dims == 3 ? (shape.w * shape.h + 3) / 4 : 16;
+            Mat local_size_xyz(std::min(8, outsize), std::min(8, outc_pack4), 1, (void*)0);
+
+            pipeline_convolution_gemm = new Pipeline(vkdev);
+            if (opt_int8.use_shader_local_memory)
+            {
+                pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
+            }
+            else
+            {
+                pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
+            }
+            pipeline_convolution_gemm->create(LayerShaderType::convolution_packed_gemm_int8, opt_int8, specializations);
+        }
+    }
+    else
+    {
+        const int outc_pack4 = (num_output + 3) / 4;
+        Mat local_size_xyz(8, 8, std::min(4, (outc_pack4 + 1) / 2), (void*)0);
+
+        std::vector<vk_specialization_type> specializations_direct = specializations;
+        for (int i = 0; i < 8; i++)
+        {
+            specializations_direct[13 + i].i = 0;
+        }
+
+        pipeline_convolution = new Pipeline(vkdev);
+        pipeline_convolution->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_convolution->create(LayerShaderType::convolution_packed_int8, opt_int8, specializations_direct);
+    }
+
+    if (opt.lightmode)
+    {
+        weight_data.release();
+    }
+
+    return 0;
+}
+
+// Record the uploads of the int8 convolution model data into cmd, releasing host copies right after they are queued.
+// On the winograd path, the weight layouts that do not match the selected variant are released without being uploaded.
+// Always returns 0; the return values of the padding and quantize upload_model calls are not checked.
+int Convolution_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt)
+{
+    // same options as the caller's, minus the fp16/bf16 packed and storage modes
+    Option opt_plain(opt);
+    opt_plain.use_fp16_packed = false;
+    opt_plain.use_fp16_storage = false;
+    opt_plain.use_bf16_packed = false;
+    opt_plain.use_bf16_storage = false;
+
+    const int kernel_area = kernel_w * kernel_h;
+    const int in_channels = weight_data_size / kernel_area / num_output;
+
+    // winograd path: enabled in opt, 3x3 kernel, stride 1, dilation 1, at least 16 input and output channels
+    const bool winograd_enabled = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution);
+    const bool kernel_3x3 = kernel_w == 3 && kernel_h == 3;
+    const bool stride_1 = stride_w == 1 && stride_h == 1;
+    const bool dilation_1 = dilation_w == 1 && dilation_h == 1;
+    const bool winograd_path = winograd_enabled && kernel_3x3 && stride_1 && dilation_1 && in_channels >= 16 && num_output >= 16;
+
+    if (!winograd_path)
+    {
+        cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt);
+    }
+    else if (use_cooperative_matrix)
+    {
+        // the _cm weight layouts; each one is uploaded only when it holds data
+        if (!weight_winograd43_data_int8_packed_cm.empty())
+        {
+            cmd.record_upload(weight_winograd43_data_int8_packed_cm, weight_data_gpu_tm_winograd43_int8_cm, opt);
+            weight_winograd43_data_int8_packed_cm.release();
+        }
+        if (!weight_winograd23_data_int8_packed_cm.empty())
+        {
+            cmd.record_upload(weight_winograd23_data_int8_packed_cm, weight_data_gpu_tm_winograd23_int8_cm, opt);
+            weight_winograd23_data_int8_packed_cm.release();
+        }
+
+        // the non-_cm winograd layouts are not uploaded on this branch
+        weight_winograd43_data_int8_packed.release();
+        weight_winograd23_data_int8_packed.release();
+    }
+    else
+    {
+        // the _cm winograd layouts are not uploaded on this branch
+        weight_winograd43_data_int8_packed_cm.release();
+        weight_winograd23_data_int8_packed_cm.release();
+
+        // the non-_cm weight layouts; each one is uploaded only when it holds data
+        if (!weight_winograd43_data_int8_packed.empty())
+        {
+            cmd.record_upload(weight_winograd43_data_int8_packed, weight_data_gpu_tm_winograd43, opt_plain);
+            weight_winograd43_data_int8_packed.release();
+        }
+        if (!weight_winograd23_data_int8_packed.empty())
+        {
+            cmd.record_upload(weight_winograd23_data_int8_packed, weight_data_gpu_tm_winograd23, opt_plain);
+            weight_winograd23_data_int8_packed.release();
+        }
+    }
+
+    // released on every path, whether or not it was uploaded above
+    weight_data_int8_packed.release();
+
+    cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt_plain);
+    weight_data_int8_descales.release();
+
+    // output scales are uploaded only when int8_scale_term is above 100; their host copy is not released here
+    if (int8_scale_term > 100)
+    {
+        cmd.record_upload(top_blob_int8_scales, top_blob_int8_scales_gpu, opt);
+    }
+
+    if (bias_term)
+    {
+        cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt_plain);
+        bias_data_int8_packed.release();
+        bias_data.release();
+    }
+
+    // padding (when set) and quantize receive the caller's unmodified options
+    if (padding)
+    {
+        padding->upload_model(cmd, opt);
+    }
+    quantize->upload_model(cmd, opt);
+
+    return 0;
+}
+
+int Convolution_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+    const bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
+    const bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
+    const bool use_winograd = opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16;
+    const bool use_gemm = opt.use_sgemm_convolution && !is_conv1x1s1d1 && !use_winograd && num_input * maxk >= 8 && num_output >= 8;
+
+    // flattened blob, implement as InnerProduct
+    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+    {
+        NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+        return -1;
+    }
+
+    if (bottom_blob.dims != 3)
+    {
+        NCNN_LOGE("Convolution_vulkan int8 only supports 3d input for now");
+        return -1;
+    }
+
+    VkMat bottom = bottom_blob;
+    bool bottom_is_int8 = bottom.elembits() == 8;
+
+    const int elempack = opt.use_packing_layout && num_input % 4 == 0 ? 4 : 1;
+
+    if (!bottom_is_int8)
+    {
+        Option opt_quantize = opt;
+        opt_quantize.blob_vkallocator = opt.workspace_vkallocator;
+        opt_quantize.use_fp16_arithmetic = false;
+
+        VkMat bottom_int8;
+        int ret = quantize->forward(bottom, bottom_int8, cmd, opt_quantize);
+        if (ret != 0)
+            return ret;
+
+        bottom = bottom_int8;
+        bottom_is_int8 = true;
+    }
+
+    int w = bottom.w;
+    int h = bottom.h;
+    const int channels = bottom.c * bottom.elempack;
+
+    if (channels != num_input)
+    {
+        NCNN_LOGE("Convolution_vulkan int8 input channels mismatch");
+        return -1;
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
+    {
+        Option opt_pad = opt;
+        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+
+        VkMat bottom_bordered;
+        int ret = padding->forward(bottom, bottom_bordered, cmd, opt_pad);
+        if (ret != 0)
+            return ret;
+
+        bottom = bottom_bordered;
+        w = bottom.w;
+        h = bottom.h;
+    }
+    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
+    {
+        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
+        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            Option opt_pad = opt;
+            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+
+            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
+            int* padding_params = padding_param_blob.mapped();
+
+            padding_params[0] = hpad / 2;
+            padding_params[1] = hpad - hpad / 2;
+            padding_params[2] = wpad / 2;
+            padding_params[3] = wpad - wpad / 2;
+            padding_params[4] = 0;
+            padding_params[5] = 0;
+
+            std::vector<VkMat> padding_inputs(2);
+            padding_inputs[0] = bottom;
+            padding_inputs[1] = padding_param_blob;
+
+            std::vector<VkMat> padding_outputs(1);
+            int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
+            if (ret != 0)
+                return ret;
+
+            bottom = padding_outputs[0];
+            w = bottom.w;
+            h = bottom.h;
+        }
+    }
+    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
+    {
+        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
+        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            Option opt_pad = opt;
+            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+
+            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
+            int* padding_params = padding_param_blob.mapped();
+
+            padding_params[0] = hpad - hpad / 2;
+            padding_params[1] = hpad / 2;
+            padding_params[2] = wpad - wpad / 2;
+            padding_params[3] = wpad / 2;
+            padding_params[4] = 0;
+            padding_params[5] = 0;
+
+            std::vector<VkMat> padding_inputs(2);
+            padding_inputs[0] = bottom;
+            padding_inputs[1] = padding_param_blob;
+
+            std::vector<VkMat> padding_outputs(1);
+            int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
+            if (ret != 0)
+                return ret;
+
+            bottom = padding_outputs[0];
+            w = bottom.w;
+            h = bottom.h;
+        }
+    }
+
+    const int outw = (w - kernel_extent_w) / stride_w + 1;
+    const int outh = (h - kernel_extent_h) / stride_h + 1;
+
+    const bool use_int8_requantize = int8_scale_term > 100;
+    const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
+    size_t out_elemsize;
+    if (use_int8_requantize)
+    {
+        out_elemsize = out_elempack;
+    }
+    else if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+    {
+        out_elemsize = (size_t)2u * out_elempack;
+    }
+    else
+    {
+        out_elemsize = (size_t)4u * out_elempack;
+    }
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    if (use_winograd)
+    {
+        bool pre_winograd43 = opt.use_winograd43_convolution;
+        const int w_bordered = w;
+        const int h_bordered = h;
+        if (opt.use_winograd23_convolution)
+        {
+            if (vkdev->info.type() == 0 && ((w_bordered <= 18 && h_bordered <= 18) || ((w_bordered >= 23 && w_bordered <= 24) && (h_bordered >= 23 && h_bordered <= 24))))
+                pre_winograd43 = false;
+            if (vkdev->info.type() != 0 && (w_bordered <= 12 && h_bordered <= 12))
+                pre_winograd43 = false;
+        }
+
+        const int B = pre_winograd43 ? 36 : 16;
+        const int c4 = (channels + 3) / 4;
+        const int block_x = pre_winograd43 ? (outw + 3) / 4 : (outw + 1) / 2;
+        const int block_y = pre_winograd43 ? (outh + 3) / 4 : (outh + 1) / 2;
+
+        VkMat bottom_tm_blob;
+        VkMat bottom_tm_blob_low;
+        VkMat bottom_tm_blob_high;
+        {
+            if (use_cooperative_matrix)
+            {
+                if (elempack == 4)
+                {
+                    bottom_tm_blob_low.create(block_x * block_y * 2, 1, c4 * B, (size_t)4u, 4, opt.workspace_vkallocator);
+                    bottom_tm_blob_high = bottom_tm_blob_low;
+                }
+                else
+                {
+                    bottom_tm_blob_low.create(block_x * block_y, 1, channels * B, (size_t)1u, 1, opt.workspace_vkallocator);
+                    bottom_tm_blob_high.create(block_x * block_y, 1, channels * B, (size_t)1u, 1, opt.workspace_vkallocator);
+                }
+                if (bottom_tm_blob_low.empty() || bottom_tm_blob_high.empty())
+                    return -100;
+            }
+            else
+            {
+                if (elempack == 4)
+                    bottom_tm_blob.create(block_x * block_y, 1, c4 * B, (size_t)8u, 4, opt.workspace_vkallocator);
+                else
+                    bottom_tm_blob.create(block_x * block_y, 1, channels * B, (size_t)2u, 1, opt.workspace_vkallocator);
+                if (bottom_tm_blob.empty())
+                    return -100;
+            }
+
+            std::vector<VkMat> bindings(use_cooperative_matrix && elempack == 1 ? 3 : 2);
+            bindings[0] = bottom;
+            if (use_cooperative_matrix)
+            {
+                bindings[1] = bottom_tm_blob_low;
+                if (elempack == 1)
+                    bindings[2] = bottom_tm_blob_high;
+            }
+            else
+            {
+                bindings[1] = bottom_tm_blob;
+            }
+
+            std::vector<vk_constant_type> constants(7);
+            constants[0].i = bottom.w;
+            constants[1].i = bottom.h;
+            constants[2].i = bottom.cstep;
+            constants[3].i = use_cooperative_matrix ? bottom_tm_blob_low.cstep : bottom_tm_blob.cstep;
+            constants[4].i = block_x;
+            constants[5].i = block_y;
+            constants[6].i = elempack == 4 ? c4 : channels;
+
+            VkMat dispatcher;
+            dispatcher.w = block_x;
+            dispatcher.h = block_y;
+            dispatcher.c = use_cooperative_matrix ? bottom_tm_blob_low.c / B : bottom_tm_blob.c / B;
+
+            const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_transform_input : pipeline_convolution_3x3s1d1_winograd23_transform_input;
+            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+        }
+
+        VkMat top_tm_blob;
+        {
+            top_tm_blob.create(block_x * block_y, 1, num_output / out_elempack * B, (size_t)4u * out_elempack, out_elempack, opt.workspace_vkallocator);
+            if (top_tm_blob.empty())
+                return -100;
+
+            if (use_cooperative_matrix)
+            {
+                std::vector<VkMat> bindings(6);
+                bindings[0] = bottom_tm_blob_low;
+                bindings[1] = bottom_tm_blob_high;
+                bindings[2] = bottom_tm_blob_low;
+                bindings[3] = top_tm_blob;
+                bindings[4] = top_tm_blob;
+                bindings[5] = pre_winograd43 ? weight_data_gpu_tm_winograd43_int8_cm : weight_data_gpu_tm_winograd23_int8_cm;
+
+                std::vector<vk_constant_type> constants(3);
+                constants[0].i = top_tm_blob.w;
+                constants[1].i = bottom_tm_blob_low.cstep;
+                constants[2].i = top_tm_blob.cstep;
+
+                const int blocks_x = (top_tm_blob.w + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M);
+                const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+
+                VkMat dispatcher;
+                dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N);
+                dispatcher.h = 1;
+                dispatcher.c = B;
+
+                const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm : pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm;
+                cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+            }
+            else
+            {
+                std::vector<VkMat> bindings(5);
+                bindings[0] = bottom_tm_blob;
+                bindings[1] = bottom_tm_blob;
+                bindings[2] = top_tm_blob;
+                bindings[3] = top_tm_blob;
+                bindings[4] = pre_winograd43 ? weight_data_gpu_tm_winograd43 : weight_data_gpu_tm_winograd23;
+
+                std::vector<vk_constant_type> constants(3);
+                constants[0].i = bottom_tm_blob.cstep;
+                constants[1].i = top_tm_blob.w;
+                constants[2].i = top_tm_blob.cstep;
+
+                VkMat dispatcher;
+                dispatcher.w = (top_tm_blob.w + 3) / 4;
+                dispatcher.h = (num_output + 3) / 4;
+                dispatcher.c = B;
+
+                const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_gemm : pipeline_convolution_3x3s1d1_winograd23_gemm;
+                cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+            }
+        }
+
+        {
+            std::vector<VkMat> bindings(6);
+            bindings[0] = top_tm_blob;
+            bindings[1] = top_blob;
+            bindings[2] = bias_data_gpu;
+            bindings[3] = weight_data_int8_descales_gpu;
+            bindings[4] = top_blob_int8_scales_gpu;
+            bindings[5] = top_blob;
+
+            std::vector<vk_constant_type> constants(7);
+            constants[0].i = top_tm_blob.cstep;
+            constants[1].i = block_x;
+            constants[2].i = block_y;
+            constants[3].i = top_blob.w;
+            constants[4].i = top_blob.h;
+            constants[5].i = top_blob.cstep;
+            constants[6].i = num_output;
+
+            VkMat dispatcher;
+            dispatcher.w = block_x;
+            dispatcher.h = block_y;
+            dispatcher.c = top_blob.c;
+
+            const Pipeline* pipeline = pre_winograd43 ? pipeline_convolution_3x3s1d1_winograd43_transform_output : pipeline_convolution_3x3s1d1_winograd23_transform_output;
+            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+        }
+    }
+    else if (is_conv1x1s1d1)
+    {
+        if (use_cooperative_matrix)
+        {
+            std::vector<VkMat> bindings_1x1(7);
+            bindings_1x1[0] = bottom;
+            bindings_1x1[1] = top_blob;
+            bindings_1x1[2] = top_blob;
+            bindings_1x1[3] = weight_data_gpu;
+            bindings_1x1[4] = bias_data_gpu;
+            bindings_1x1[5] = weight_data_int8_descales_gpu;
+            bindings_1x1[6] = top_blob_int8_scales_gpu;
+
+            const int size = top_blob.w * top_blob.h;
+
+            std::vector<vk_constant_type> constants_1x1(5);
+            constants_1x1[0].u32 = bottom.cstep;
+            constants_1x1[1].u32 = top_blob.cstep;
+            constants_1x1[2].u32 = size;
+            constants_1x1[3].u32 = num_output;
+            constants_1x1[4].u32 = num_input;
+
+            const int blocks_x = (size + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M);
+            const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+
+            VkMat dispatcher;
+            dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N);
+            dispatcher.h = 1;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings_1x1, constants_1x1, dispatcher);
+        }
+        else
+        {
+            std::vector<VkMat> bindings_1x1(7);
+            bindings_1x1[0] = bottom;
+            bindings_1x1[1] = top_blob;
+            bindings_1x1[2] = weight_data_gpu;
+            bindings_1x1[3] = bias_data_gpu;
+            bindings_1x1[4] = weight_data_int8_descales_gpu;
+            bindings_1x1[5] = top_blob_int8_scales_gpu;
+            bindings_1x1[6] = top_blob;
+
+            const int num_input_packed = (num_input + 3) / 4 * 4;
+            const int num_output_packed = (num_output + 3) / 4 * 4;
+            const int outc_pack4 = num_output_packed / 4;
+            const int c_packed = num_input_packed / 4;
+            const int cstep_vec4 = bottom.elempack == 4 ? bottom.cstep : bottom.cstep / 4;
+            const int size = (top_blob.w * top_blob.h + 3) / 4;
+            const int outcstep_vec4 = out_elempack == 4 ? top_blob.cstep : top_blob.cstep / 4;
+            const int outcstep_native = top_blob.cstep / 4;
+
+            std::vector<vk_constant_type> constants_1x1(6);
+            constants_1x1[0].i = c_packed;
+            constants_1x1[1].i = cstep_vec4;
+            constants_1x1[2].i = outc_pack4;
+            constants_1x1[3].i = outcstep_vec4;
+            constants_1x1[4].i = outcstep_native;
+            constants_1x1[5].i = size;
+
+            VkMat dispatcher;
+            dispatcher.w = size;
+            dispatcher.h = outc_pack4;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings_1x1, constants_1x1, dispatcher);
+        }
+    }
+    else if (use_gemm)
+    {
+        if (use_cooperative_matrix)
+        {
+            std::vector<VkMat> bindings(7);
+            bindings[0] = bottom;
+            bindings[1] = top_blob;
+            bindings[2] = top_blob;
+            bindings[3] = weight_data_gpu;
+            bindings[4] = bias_data_gpu;
+            bindings[5] = weight_data_int8_descales_gpu;
+            bindings[6] = top_blob_int8_scales_gpu;
+
+            std::vector<vk_constant_type> constants(8);
+            constants[0].u32 = bottom.w;
+            constants[1].u32 = bottom.h;
+            constants[2].u32 = bottom.cstep;
+            constants[3].u32 = top_blob.w;
+            constants[4].u32 = top_blob.h;
+            constants[5].u32 = top_blob.cstep;
+            constants[6].u32 = num_output;
+            constants[7].u32 = num_input;
+
+            const int size = top_blob.w * top_blob.h;
+            const int blocks_x = (size + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M);
+            const int blocks_y = (num_output + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N);
+
+            VkMat dispatcher;
+            dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N);
+            dispatcher.h = 1;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
+        }
+        else
+        {
+            const int c_packed = (num_input + 3) / 4;
+            const int outc_pack4 = (num_output + 3) / 4;
+
+            std::vector<VkMat> bindings(10);
+            bindings[0] = bottom;
+            bindings[1] = top_blob;
+            bindings[2] = bottom;
+            bindings[3] = top_blob;
+            bindings[4] = weight_data_gpu;
+            bindings[5] = bias_data_gpu;
+            bindings[6] = weight_data_int8_descales_gpu;
+            bindings[7] = top_blob_int8_scales_gpu;
+            bindings[8] = top_blob;
+            bindings[9] = top_blob;
+
+            std::vector<vk_constant_type> constants(8);
+            constants[0].i = bottom.w;
+            constants[1].i = bottom.h;
+            constants[2].i = c_packed;
+            constants[3].i = bottom.cstep;
+            constants[4].i = top_blob.w;
+            constants[5].i = top_blob.h;
+            constants[6].i = outc_pack4;
+            constants[7].i = out_elempack == 4 ? top_blob.cstep : top_blob.cstep * 4;
+
+            VkMat dispatcher;
+            dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
+            dispatcher.h = outc_pack4;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
+        }
+    }
+    else
+    {
+        const int c_packed = (num_input + 3) / 4;
+        const int c_shader = bottom.elempack == 4 ? c_packed : num_input;
+        const int outc_pack4 = (num_output + 3) / 4;
+
+        std::vector<VkMat> bindings(10);
+        bindings[0] = bottom;
+        bindings[1] = top_blob;
+        bindings[2] = bottom;
+        bindings[3] = top_blob;
+        bindings[4] = weight_data_gpu;
+        bindings[5] = bias_data_gpu;
+        bindings[6] = weight_data_int8_descales_gpu;
+        bindings[7] = top_blob_int8_scales_gpu;
+        bindings[8] = top_blob;
+        bindings[9] = top_blob;
+
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = bottom.w;
+        constants[1].i = bottom.h;
+        constants[2].i = c_shader;
+        constants[3].i = bottom.cstep;
+        constants[4].i = top_blob.w;
+        constants[5].i = top_blob.h;
+        constants[6].i = outc_pack4;
+        constants[7].i = out_elempack == 4 ? top_blob.cstep : top_blob.cstep * 4;
+
+        VkMat dispatcher;
+        dispatcher.w = (top_blob.w + 1) / 2;
+        dispatcher.h = (top_blob.h + 1) / 2;
+        dispatcher.c = (outc_pack4 + 1) / 2;
+
+        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
+    }
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h
index eeb6bba07d26..d03c095f11d4 100644
--- a/src/layer/vulkan/convolution_vulkan.h
+++ b/src/layer/vulkan/convolution_vulkan.h
@@ -23,6 +23,13 @@ class Convolution_vulkan : public Convolution
     using Convolution::forward;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8(const Option& opt);
+    int upload_model_int8(VkTransfer& cmd, const Option& opt);
+    int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+#endif
+
 public:
     ncnn::Layer* padding;
 
@@ -35,7 +42,6 @@ class Convolution_vulkan : public Convolution
 
     Pipeline* pipeline_convolution;
     Pipeline* pipeline_convolution_1x1s1d1;
-
     Pipeline* pipeline_convolution_gemm;
 
     // winograd23 and winograd43
@@ -64,6 +70,26 @@ class Convolution_vulkan : public Convolution
     int UNROLL_SG_K;
     int UNROLL_WG_M;
     int UNROLL_WG_N;
+
+#if NCNN_INT8
+    ncnn::Layer* quantize;
+
+    VkMat weight_data_gpu_tm_winograd23_int8_cm;
+    VkMat weight_data_gpu_tm_winograd43_int8_cm;
+    Pipeline* pipeline_convolution_3x3s1d1_winograd23_gemm_int8_cm;
+    Pipeline* pipeline_convolution_3x3s1d1_winograd43_gemm_int8_cm;
+
+    Mat weight_data_int8_packed;
+    Mat weight_winograd23_data_int8_packed;
+    Mat weight_winograd23_data_int8_packed_cm;
+    Mat weight_winograd43_data_int8_packed;
+    Mat weight_winograd43_data_int8_packed_cm;
+    Mat weight_data_int8_descales;
+    Mat bias_data_int8_packed;
+
+    VkMat weight_data_int8_descales_gpu;
+    VkMat top_blob_int8_scales_gpu;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
index 44e5976007c8..c41606fa0e54 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -5,6 +5,9 @@
 
 #include "layer_shader_type.h"
 #include "layer_type.h"
+#include "modelbin.h"
+
+#include <string.h>
 
 namespace ncnn {
 
@@ -22,6 +25,10 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
     pipeline_convolutiondepthwise_group_pack4 = 0;
     pipeline_convolutiondepthwise_group_pack1to4 = 0;
     pipeline_convolutiondepthwise_group_pack4to1 = 0;
+
+#if NCNN_INT8
+    quantize = 0;
+#endif
 }
 
 int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
@@ -33,11 +40,26 @@ int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
         support_vulkan = false;
     }
 
+#if NCNN_INT8
+    if (int8_scale_term && pad_value != 0.f)
+    {
+        NCNN_LOGE("ConvolutionDepthWise_vulkan int8 nonzero pad value is not supported");
+        support_vulkan = false;
+    }
+#endif
+
     return ret;
 }
 
 int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return create_pipeline_int8(opt);
+    }
+#endif
+
     const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
@@ -310,11 +332,27 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
     delete pipeline_convolutiondepthwise_group_pack4to1;
     pipeline_convolutiondepthwise_group_pack4to1 = 0;
 
+#if NCNN_INT8
+    if (quantize)
+    {
+        quantize->destroy_pipeline(opt);
+        delete quantize;
+        quantize = 0;
+    }
+#endif
+
     return 0;
 }
 
 int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return upload_model_int8(cmd, opt);
+    }
+#endif
+
     if (padding)
     {
         padding->upload_model(cmd, opt);
@@ -349,6 +387,13 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt
 
 int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blob, top_blob, cmd, opt);
+    }
+#endif
+
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
@@ -539,4 +584,801 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
     return 0;
 }
 
+#if NCNN_INT8
+int ConvolutionDepthWise_vulkan::create_pipeline_int8(const Option& opt)
+{
+    Mat shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    Mat out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    if (shape.dims != 3) shape = Mat();
+    if (out_shape.dims != 3) out_shape = Mat();
+
+    const int maxk = kernel_w * kernel_h;
+    if (group == 0 || num_output % group != 0)
+    {
+        NCNN_LOGE("ConvolutionDepthWise_vulkan int8 invalid group");
+        return -1;
+    }
+
+    const int num_output_g = num_output / group;
+    const int weight_data_size_g = group * maxk * num_output_g;
+    if (weight_data_size_g == 0 || weight_data_size % weight_data_size_g != 0)
+    {
+        NCNN_LOGE("ConvolutionDepthWise_vulkan int8 weight shape mismatch");
+        return -1;
+    }
+
+    int channels = weight_data_size / weight_data_size_g * group;
+    const bool is_depthwise = channels == group && group == num_output;
+    const int channels_g = channels / group;
+    const int elempack = is_depthwise && opt.use_packing_layout && group % 4 == 0 ? 4 : 1;
+    const int elempack_g = !is_depthwise && opt.use_packing_layout && channels_g % 4 == 0 ? 4 : 1;
+    const int out_elempack_g = !is_depthwise && opt.use_packing_layout && num_output_g % 4 == 0 ? 4 : 1;
+    const int bottom_elempack = is_depthwise ? elempack : elempack_g;
+    const int num_output_g_pack4 = (num_output_g + 3) / 4;
+    const int num_output_g_pack4_aligned = (num_output_g_pack4 + 7) / 8 * 8;
+
+    if (weight_data.elemsize != (size_t)1u)
+    {
+        NCNN_LOGE("ConvolutionDepthWise_vulkan int8 weight data is not int8");
+        return -1;
+    }
+
+    Option opt_int8 = opt;
+    opt_int8.use_fp16_arithmetic = false;
+    opt_int8.use_int16_packed = false;
+    opt_int8.use_int16_storage = false;
+
+    Mat shape_int8;
+    if (shape.dims == 3)
+    {
+        shape_int8 = Mat(shape.w, shape.h, channels / bottom_elempack, (void*)0, (size_t)bottom_elempack, bottom_elempack);
+    }
+
+    Mat shape_int8_bordered;
+    if (shape_int8.dims == 3)
+    {
+        if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
+        {
+            shape_int8_bordered = Mat(shape_int8.w + pad_left + pad_right, shape_int8.h + pad_top + pad_bottom, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack);
+        }
+        else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
+                 || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
+        {
+            const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+            const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+            int wpad = kernel_extent_w + (shape_int8.w - 1) / stride_w * stride_w - shape_int8.w;
+            int hpad = kernel_extent_h + (shape_int8.h - 1) / stride_h * stride_h - shape_int8.h;
+            if (wpad > 0 || hpad > 0)
+            {
+                shape_int8_bordered = Mat(shape_int8.w + wpad, shape_int8.h + hpad, shape_int8.c, (void*)0, shape_int8.elemsize, shape_int8.elempack);
+            }
+            else
+            {
+                shape_int8_bordered = shape_int8;
+            }
+        }
+        else
+        {
+            shape_int8_bordered = shape_int8;
+        }
+    }
+
+    Mat shape_padding_int8_bordered;
+    if (shape_int8_bordered.dims == 3)
+    {
+        const int padding_outc = shape_int8_bordered.c * shape_int8_bordered.elempack;
+        const int padding_out_elempack = padding_outc % 4 == 0 ? 4 : 1;
+        const size_t padding_out_elemsize = shape_int8_bordered.elemsize / shape_int8_bordered.elempack * padding_out_elempack;
+        shape_padding_int8_bordered = Mat(shape_int8_bordered.w, shape_int8_bordered.h, padding_outc / padding_out_elempack, (void*)0, padding_out_elemsize, padding_out_elempack);
+    }
+
+    {
+        quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize);
+        quantize->vkdev = vkdev;
+
+        Mat shape_quantize;
+        Mat out_shape_quantize;
+        if (shape.dims == 3)
+        {
+            size_t shape_elemsize = shape.elemsize;
+            if (shape.elempack != bottom_elempack)
+                shape_elemsize = shape.elemsize / shape.elempack * bottom_elempack;
+
+            shape_quantize = Mat(shape.w, shape.h, channels / bottom_elempack, (void*)0, shape_elemsize, bottom_elempack);
+            out_shape_quantize = shape_int8;
+        }
+
+        quantize->bottom_shapes.resize(1);
+        quantize->bottom_shapes[0] = shape_quantize;
+        quantize->top_shapes.resize(1);
+        quantize->top_shapes[0] = out_shape_quantize;
+
+        ncnn::ParamDict pd;
+        pd.set(0, 1);
+        quantize->load_param(pd);
+
+        Mat weights[1];
+        weights[0] = bottom_blob_int8_scales;
+        quantize->load_model(ModelBinFromMatArray(weights));
+
+        quantize->create_pipeline(opt_int8);
+    }
+
+    {
+        padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
+        padding->vkdev = vkdev;
+
+        padding->bottom_shapes.resize(1);
+        padding->bottom_shapes[0] = shape_int8;
+        padding->top_shapes.resize(1);
+        padding->top_shapes[0] = shape_padding_int8_bordered;
+
+        ncnn::ParamDict pd;
+        pd.set(0, pad_top);
+        pd.set(1, pad_bottom);
+        pd.set(2, pad_left);
+        pd.set(3, pad_right);
+        pd.set(4, 0);
+        pd.set(5, 0.f);
+
+        padding->load_param(pd);
+
+        padding->create_pipeline(opt);
+    }
+
+    if (is_depthwise)
+    {
+        const int maxk4 = (maxk + 3) / 4 * 4;
+
+        if (elempack == 4)
+        {
+            const Mat weight_data_r2 = weight_data.reshape(maxk, group);
+
+            weight_data_int8_packed.create(maxk4 / 4, group / 4, (size_t)16u, 1);
+            memset(weight_data_int8_packed.data, 0, weight_data_int8_packed.total() * weight_data_int8_packed.elemsize);
+
+            for (int q = 0; q + 3 < group; q += 4)
+            {
+                signed char* g00 = weight_data_int8_packed.row<signed char>(q / 4);
+                const signed char* k0 = weight_data_r2.row<const signed char>(q);
+                const signed char* k1 = weight_data_r2.row<const signed char>(q + 1);
+                const signed char* k2 = weight_data_r2.row<const signed char>(q + 2);
+                const signed char* k3 = weight_data_r2.row<const signed char>(q + 3);
+
+                for (int k = 0; k < maxk4; k += 4)
+                {
+                    signed char* g0 = g00 + k * 4;
+                    signed char* g1 = g0 + 4;
+                    signed char* g2 = g1 + 4;
+                    signed char* g3 = g2 + 4;
+
+                    for (int i = 0; i < 4 && k + i < maxk; i++)
+                    {
+                        g0[i] = k0[k + i];
+                        g1[i] = k1[k + i];
+                        g2[i] = k2[k + i];
+                        g3[i] = k3[k + i];
+                    }
+                }
+            }
+        }
+        else
+        {
+            const Mat weight_data_r2 = weight_data.reshape(maxk, group);
+
+            weight_data_int8_packed.create(maxk4 / 4, group, (size_t)4u, 4);
+            weight_data_int8_packed.fill(0);
+
+            for (int q = 0; q < group; q++)
+            {
+                const signed char* k0 = weight_data_r2.row<const signed char>(q);
+                signed char* g00 = weight_data_int8_packed.row<signed char>(q);
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    g00[k] = k0[k];
+                }
+            }
+        }
+    }
+    else
+    {
+        const Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);
+
+        if (elempack_g == 4)
+        {
+            weight_data_int8_packed.create(maxk, channels_g / 4, num_output_g_pack4_aligned * group, (size_t)16u, 16);
+            weight_data_int8_packed.fill(0);
+
+            for (int g = 0; g < group; g++)
+            {
+                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
+
+                for (int q = 0; q < num_output_g; q += 4)
+                {
+                    const int outch_pack = std::min(4, num_output_g - q);
+                    Mat weight_data_packed = weight_data_int8_packed.channel(g * num_output_g_pack4_aligned + q / 4);
+
+                    for (int p = 0; p < channels_g; p += 4)
+                    {
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            signed char* g00 = weight_data_packed.row<signed char>(p / 4) + k * 16;
+
+                            for (int i = 0; i < outch_pack; i++)
+                            {
+                                const signed char* k0 = weight_data_r2.channel(q + i).row<const signed char>(p);
+                                const signed char* k1 = weight_data_r2.channel(q + i).row<const signed char>(p + 1);
+                                const signed char* k2 = weight_data_r2.channel(q + i).row<const signed char>(p + 2);
+                                const signed char* k3 = weight_data_r2.channel(q + i).row<const signed char>(p + 3);
+
+                                g00[i * 4 + 0] = k0[k];
+                                g00[i * 4 + 1] = k1[k];
+                                g00[i * 4 + 2] = k2[k];
+                                g00[i * 4 + 3] = k3[k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            weight_data_int8_packed.create(maxk * channels_g, num_output_g_pack4_aligned * group, (size_t)4u, 4);
+            weight_data_int8_packed.fill(0);
+
+            for (int g = 0; g < group; g++)
+            {
+                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
+
+                for (int q = 0; q < num_output_g; q += 4)
+                {
+                    const int outch_pack = std::min(4, num_output_g - q);
+                    signed char* g00 = weight_data_int8_packed.row<signed char>(g * num_output_g_pack4_aligned + q / 4);
+
+                    for (int p = 0; p < channels_g; p++)
+                    {
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            signed char* g00p = g00 + (p * maxk + k) * 4;
+
+                            for (int i = 0; i < outch_pack; i++)
+                            {
+                                const signed char* k0 = weight_data_r2.channel(q + i).row<const signed char>(p);
+                                g00p[i] = k0[k];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    const bool use_int8_requantize = int8_scale_term > 100;
+
+    if (is_depthwise)
+    {
+        if (elempack == 4)
+        {
+            weight_data_int8_descales.create(group / 4, (size_t)16u, 4);
+
+            float* outptr = weight_data_int8_descales;
+            for (int g = 0; g < group; g++)
+            {
+                const float bottom_scale = bottom_blob_int8_scales[g];
+                const float weight_scale = weight_data_int8_scales[g];
+                outptr[g] = bottom_scale == 0.f || weight_scale == 0.f ? 0.f : 1.f / (bottom_scale * weight_scale);
+            }
+        }
+        else
+        {
+            weight_data_int8_descales.create(group, (size_t)4u, 1);
+
+            float* outptr = weight_data_int8_descales;
+            for (int g = 0; g < group; g++)
+            {
+                const float bottom_scale = bottom_blob_int8_scales[g];
+                const float weight_scale = weight_data_int8_scales[g];
+                outptr[g] = bottom_scale == 0.f || weight_scale == 0.f ? 0.f : 1.f / (bottom_scale * weight_scale);
+            }
+        }
+    }
+    else
+    {
+        weight_data_int8_descales.create(num_output_g_pack4_aligned * group, (size_t)16u, 4);
+        weight_data_int8_descales.fill(0.f);
+
+        float* outptr = weight_data_int8_descales;
+        for (int g = 0; g < group; g++)
+        {
+            const float bottom_scale = bottom_blob_int8_scales[g];
+            const float weight_scale = weight_data_int8_scales[g];
+            const float descale = bottom_scale == 0.f || weight_scale == 0.f ? 0.f : 1.f / (bottom_scale * weight_scale);
+
+            for (int q = 0; q < num_output_g; q++)
+            {
+                outptr[g * num_output_g_pack4_aligned * 4 + q] = descale;
+            }
+        }
+    }
+
+    if (use_int8_requantize)
+    {
+        if (is_depthwise)
+        {
+            if (elempack == 4)
+            {
+                top_blob_int8_scales_packed.create(group / 4, (size_t)16u, 4);
+
+                float* outptr = top_blob_int8_scales_packed;
+                for (int g = 0; g < group; g++)
+                {
+                    outptr[g] = top_blob_int8_scales[g];
+                }
+            }
+            else
+            {
+                top_blob_int8_scales_packed = top_blob_int8_scales;
+            }
+        }
+        else
+        {
+            top_blob_int8_scales_packed.create(num_output_g_pack4_aligned * group, (size_t)16u, 4);
+            top_blob_int8_scales_packed.fill(0.f);
+
+            float* outptr = top_blob_int8_scales_packed;
+            for (int g = 0; g < group; g++)
+            {
+                const float top_scale = top_blob_int8_scales[g];
+
+                for (int q = 0; q < num_output_g; q++)
+                {
+                    outptr[g * num_output_g_pack4_aligned * 4 + q] = top_scale;
+                }
+            }
+        }
+    }
+
+    if (bias_term)
+    {
+        if (is_depthwise)
+        {
+            if (elempack == 4)
+            {
+                bias_data_int8_packed.create(num_output / 4, (size_t)16u, 4);
+                bias_data_int8_packed.fill(0.f);
+
+                float* outptr = bias_data_int8_packed;
+                for (int q = 0; q < num_output; q++)
+                {
+                    outptr[q] = bias_data[q];
+                }
+            }
+            else
+            {
+                bias_data_int8_packed = bias_data;
+            }
+        }
+        else
+        {
+            bias_data_int8_packed.create(num_output_g_pack4_aligned * group, (size_t)16u, 4);
+            bias_data_int8_packed.fill(0.f);
+
+            float* outptr = bias_data_int8_packed;
+            for (int q = 0; q < num_output; q++)
+            {
+                const int g = q / num_output_g;
+                const int qg = q - g * num_output_g;
+                outptr[g * num_output_g_pack4_aligned * 4 + qg] = bias_data[q];
+            }
+        }
+    }
+
+    const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
+    const bool use_sfp_output = !use_int8_requantize && (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed);
+    size_t out_elemsize;
+    if (use_int8_requantize)
+    {
+        out_elemsize = out_elempack;
+    }
+    else if (use_sfp_output)
+    {
+        out_elemsize = (size_t)2u * out_elempack;
+    }
+    else
+    {
+        out_elemsize = (size_t)4u * out_elempack;
+    }
+
+    size_t out_elemsize_g;
+    if (use_int8_requantize)
+    {
+        out_elemsize_g = out_elempack_g;
+    }
+    else if (use_sfp_output)
+    {
+        out_elemsize_g = (size_t)2u * out_elempack_g;
+    }
+    else
+    {
+        out_elemsize_g = (size_t)4u * out_elempack_g;
+    }
+
+    Mat out_shape_int8;
+    if (out_shape.dims == 3)
+        out_shape_int8 = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack);
+
+    Mat out_shape_int8_g;
+    if (out_shape.dims == 3)
+        out_shape_int8_g = Mat(out_shape.w, out_shape.h, num_output / out_elempack_g, (void*)0, out_elemsize_g, out_elempack_g);
+
+    std::vector<vk_specialization_type> specializations(12 + 10);
+    specializations[0].i = kernel_w;
+    specializations[1].i = kernel_h;
+    specializations[2].i = dilation_w;
+    specializations[3].i = dilation_h;
+    specializations[4].i = stride_w;
+    specializations[5].i = stride_h;
+    specializations[6].i = bias_term;
+    specializations[7].i = group;
+    specializations[8].i = activation_type;
+    specializations[9].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+    specializations[10].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+    specializations[11].i = use_int8_requantize ? 1 : 0;
+
+    if (is_depthwise)
+    {
+        specializations[12 + 0].i = shape_int8_bordered.dims;
+        specializations[12 + 1].i = shape_int8_bordered.w;
+        specializations[12 + 2].i = shape_int8_bordered.h;
+        specializations[12 + 3].i = shape_int8_bordered.c;
+        specializations[12 + 4].i = shape_int8_bordered.cstep;
+        specializations[12 + 5].i = out_shape_int8.dims;
+        specializations[12 + 6].i = out_shape_int8.w;
+        specializations[12 + 7].i = out_shape_int8.h;
+        specializations[12 + 8].i = out_shape_int8.c;
+        specializations[12 + 9].i = out_shape_int8.cstep;
+
+        Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
+        if (out_shape.dims != 0)
+        {
+            local_size_xyz.w = std::min(8, out_shape.w);
+            local_size_xyz.h = std::min(8, out_shape.h);
+            local_size_xyz.c = std::min(4, out_shape_int8.c);
+        }
+
+        if (opt.use_packing_layout && group % 4 == 0)
+        {
+            pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev);
+            pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
+            pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4_int8, opt_int8, specializations);
+        }
+        else
+        {
+            pipeline_convolutiondepthwise = new Pipeline(vkdev);
+            pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
+            pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise_int8, opt_int8, specializations);
+        }
+    }
+    else
+    {
+        std::vector<vk_specialization_type> specializations_group(15 + 10);
+        for (int i = 0; i < 12; i++)
+        {
+            specializations_group[i] = specializations[i];
+        }
+        specializations_group[12].i = elempack_g;
+        specializations_group[13].i = out_elempack_g;
+        specializations_group[14].i = num_output_g;
+        for (int i = 0; i < 10; i++)
+        {
+            specializations_group[15 + i].i = 0;
+        }
+
+        Mat local_size_xyz(8, 8, 1, (void*)0);
+        if (out_shape.dims != 0)
+        {
+            local_size_xyz.w = std::min(8, out_shape.w);
+            local_size_xyz.h = std::min(8, out_shape.h);
+            local_size_xyz.c = std::min(4, group * num_output_g_pack4);
+        }
+
+        pipeline_convolutiondepthwise_group = new Pipeline(vkdev);
+        pipeline_convolutiondepthwise_group->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_convolutiondepthwise_group->create(LayerShaderType::convolutiondepthwise_group_packed_int8, opt_int8, specializations_group);
+    }
+
+    if (opt.lightmode)
+    {
+        weight_data.release();
+    }
+
+    return 0;
+}
+// Records uploads of the int8 weights, descales, optional requantize scales and optional bias, then forwards to the padding and quantize sub-layers. Always returns 0.
+int ConvolutionDepthWise_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt)
+{
+    Option opt_fp32 = opt; // copy of opt with every fp16/bf16 storage and packed flag cleared below
+    opt_fp32.use_fp16_packed = false;
+    opt_fp32.use_fp16_storage = false;
+    opt_fp32.use_bf16_packed = false;
+    opt_fp32.use_bf16_storage = false;
+
+    cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt);
+
+    weight_data_int8_packed.release(); // each Mat-side table is released as soon as its upload is recorded
+
+    cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt_fp32);
+
+    weight_data_int8_descales.release();
+
+    const bool use_int8_requantize = int8_scale_term > 100;
+    if (use_int8_requantize)
+    {
+        cmd.record_upload(top_blob_int8_scales_packed, top_blob_int8_scales_gpu, opt); // NOTE(review): uses opt, while descales and bias use opt_fp32 — confirm this is intended
+
+        top_blob_int8_scales_packed.release();
+    }
+
+    if (bias_term)
+    {
+        cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt_fp32);
+
+        bias_data_int8_packed.release();
+
+        bias_data.release(); // the original bias Mat is released as well
+    }
+
+    if (padding)
+    {
+        padding->upload_model(cmd, opt);
+    }
+
+    quantize->upload_model(cmd, opt); // NOTE(review): unlike padding, quantize is dereferenced without a null check — presumably always created for int8; confirm
+
+    return 0;
+}
+// Int8 forward: quantizes a non-8-bit input, pads it, then records the depthwise or grouped int8 pipeline. Returns 0, a sub-layer error, -1 if channels % group != 0, or -100 if an output allocation is empty.
+int ConvolutionDepthWise_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    VkMat bottom = bottom_blob;
+    bool bottom_is_int8 = bottom.elembits() == 8; // 8-bit elements are treated as already quantized
+
+    int channels = bottom.c * bottom.elempack;
+    const bool is_depthwise = channels == group && group == num_output; // one input and one output channel per group
+    const int channels_g = channels / group;
+    const int num_output_g = num_output / group;
+    const int elempack = is_depthwise && opt.use_packing_layout && channels % 4 == 0 ? 4 : 1;
+    const int elempack_g = !is_depthwise && opt.use_packing_layout && channels_g % 4 == 0 ? 4 : 1;
+    const int out_elempack_g = !is_depthwise && opt.use_packing_layout && num_output_g % 4 == 0 ? 4 : 1;
+    const int bottom_elempack = is_depthwise ? elempack : elempack_g; // input packing used by the pipeline recorded below
+    // intermediate blobs come from the workspace allocator, with fp16 arithmetic turned off
+    Option opt_workspace = opt;
+    opt_workspace.blob_vkallocator = opt.workspace_vkallocator;
+    opt_workspace.use_fp16_arithmetic = false;
+
+    if (bottom.elempack != bottom_elempack) // repack the input to bottom_elempack before quantizing
+    {
+        VkMat bottom_unpacked;
+        vkdev->convert_packing(bottom, bottom_unpacked, bottom_elempack, cmd, opt_workspace);
+        bottom = bottom_unpacked;
+    }
+
+    if (!bottom_is_int8) // non-8-bit input goes through the quantize sub-layer first
+    {
+        VkMat bottom_int8;
+        int ret = quantize->forward(bottom, bottom_int8, cmd, opt_workspace);
+        if (ret != 0)
+            return ret;
+
+        bottom = bottom_int8;
+        bottom_is_int8 = true;
+    }
+
+    int w = bottom.w;
+    int h = bottom.h;
+    channels = bottom.c * bottom.elempack; // recomputed from the blob after repacking/quantizing
+
+    if (channels % group != 0)
+    {
+        NCNN_LOGE("ConvolutionDepthWise_vulkan int8 input channels mismatch");
+        return -1;
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; // kernel footprint including dilation
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) // any positive pad: run the padding sub-layer with its own parameters
+    {
+        VkMat bottom_bordered;
+        int ret = padding->forward(bottom, bottom_bordered, cmd, opt_workspace);
+        if (ret != 0)
+            return ret;
+
+        bottom = bottom_bordered;
+        w = bottom.w;
+        h = bottom.h;
+    }
+    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) // all pads -233: amounts are derived from the input size here
+    {
+        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
+        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); // six 4-byte values from the staging allocator, written through mapped()
+            int* padding_params = padding_param_blob.mapped();
+
+            padding_params[0] = hpad / 2; // smaller half of hpad goes first
+            padding_params[1] = hpad - hpad / 2;
+            padding_params[2] = wpad / 2; // smaller half of wpad goes first
+            padding_params[3] = wpad - wpad / 2;
+            padding_params[4] = 0; // slots 4 and 5 stay zero (presumably the channel-axis pads — confirm against the Padding layer)
+            padding_params[5] = 0;
+
+            std::vector<VkMat> padding_inputs(2); // the parameter blob is passed to padding as a second input
+            padding_inputs[0] = bottom;
+            padding_inputs[1] = padding_param_blob;
+
+            std::vector<VkMat> padding_outputs(1);
+            int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_workspace);
+            if (ret != 0)
+                return ret;
+
+            bottom = padding_outputs[0];
+            w = bottom.w;
+            h = bottom.h;
+        }
+    }
+    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) // all pads -234: same amounts as -233, halves swapped
+    {
+        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
+        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
+        if (wpad > 0 || hpad > 0)
+        {
+            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
+            int* padding_params = padding_param_blob.mapped();
+
+            padding_params[0] = hpad - hpad / 2; // larger half of hpad goes first
+            padding_params[1] = hpad / 2;
+            padding_params[2] = wpad - wpad / 2; // larger half of wpad goes first
+            padding_params[3] = wpad / 2;
+            padding_params[4] = 0;
+            padding_params[5] = 0;
+
+            std::vector<VkMat> padding_inputs(2);
+            padding_inputs[0] = bottom;
+            padding_inputs[1] = padding_param_blob;
+
+            std::vector<VkMat> padding_outputs(1);
+            int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_workspace);
+            if (ret != 0)
+                return ret;
+
+            bottom = padding_outputs[0];
+            w = bottom.w;
+            h = bottom.h;
+        }
+    }
+
+    if (bottom.elempack != bottom_elempack) // packing is re-checked after padding and restored if it differs
+    {
+        VkMat bottom_unpacked;
+        vkdev->convert_packing(bottom, bottom_unpacked, bottom_elempack, cmd, opt_workspace);
+        bottom = bottom_unpacked;
+        w = bottom.w;
+        h = bottom.h;
+    }
+
+    const int outw = (w - kernel_extent_w) / stride_w + 1;
+    const int outh = (h - kernel_extent_h) / stride_h + 1;
+
+    const bool use_int8_requantize = int8_scale_term > 100; // above 100 the output is stored with one byte per element
+    const int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
+    const bool use_sfp_output = !use_int8_requantize && (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed);
+    size_t out_elemsize; // bytes per packed element: 1, 2 or 4 bytes per lane times out_elempack
+    if (use_int8_requantize)
+    {
+        out_elemsize = out_elempack;
+    }
+    else if (use_sfp_output)
+    {
+        out_elemsize = (size_t)2u * out_elempack;
+    }
+    else
+    {
+        out_elemsize = (size_t)4u * out_elempack;
+    }
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    VkMat top_blob_unpacked = top_blob; // the pipeline writes here; a separate workspace blob is used only when the grouped output packing differs
+    if (!is_depthwise && out_elempack_g != out_elempack)
+    {
+        size_t out_elemsize_g; // same 1/2/4-byte rule as out_elemsize, for out_elempack_g
+        if (use_int8_requantize)
+        {
+            out_elemsize_g = out_elempack_g;
+        }
+        else if (use_sfp_output)
+        {
+            out_elemsize_g = (size_t)2u * out_elempack_g;
+        }
+        else
+        {
+            out_elemsize_g = (size_t)4u * out_elempack_g;
+        }
+
+        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
+        if (top_blob_unpacked.empty())
+            return -100;
+    }
+
+    std::vector<vk_constant_type> constants(10); // input dims/w/h/c/cstep followed by output dims/w/h/c/cstep
+    constants[0].i = bottom.dims;
+    constants[1].i = bottom.w;
+    constants[2].i = bottom.h;
+    constants[3].i = bottom.c;
+    constants[4].i = bottom.cstep;
+    constants[5].i = top_blob_unpacked.dims;
+    constants[6].i = top_blob_unpacked.w;
+    constants[7].i = top_blob_unpacked.h;
+    constants[8].i = top_blob_unpacked.c;
+    constants[9].i = top_blob_unpacked.cstep;
+
+    if (is_depthwise)
+    {
+        std::vector<VkMat> bindings(7);
+        bindings[0] = bottom;
+        bindings[1] = top_blob_unpacked;
+        bindings[2] = weight_data_gpu;
+        bindings[3] = bias_data_gpu;
+        bindings[4] = weight_data_int8_descales_gpu;
+        bindings[5] = top_blob_int8_scales_gpu;
+        // binding 6 aliases top with int8 SSBO element type
+        bindings[6] = top_blob_unpacked;
+
+        const Pipeline* pipeline = bottom.elempack == 4 ? pipeline_convolutiondepthwise_pack4 : pipeline_convolutiondepthwise; // chosen by the input packing
+        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+    }
+    else
+    {
+        std::vector<VkMat> bindings(10); // bottom and top are bound several times; presumably aliased under different element types as in the depthwise path — confirm against the shader
+        bindings[0] = bottom;
+        bindings[1] = top_blob_unpacked;
+        bindings[2] = bottom;
+        bindings[3] = top_blob_unpacked;
+        bindings[4] = weight_data_gpu;
+        bindings[5] = bias_data_gpu;
+        bindings[6] = weight_data_int8_descales_gpu;
+        bindings[7] = top_blob_int8_scales_gpu;
+        bindings[8] = top_blob_unpacked;
+        bindings[9] = top_blob_unpacked;
+
+        const int num_output_g_pack4 = (num_output_g + 3) / 4; // outputs per group, rounded up to whole blocks of 4
+
+        std::vector<vk_constant_type> constants_group = constants;
+        constants_group[8].i = num_output_g_pack4 * group; // output channel count replaced by the number of 4-wide output blocks
+        constants_group[9].i = out_elempack_g == 4 ? top_blob_unpacked.cstep : top_blob_unpacked.cstep * 4; // cstep is scaled by 4 when the grouped output is not packed
+
+        VkMat dispatcher; // only w/h/c are set: output width, height and 4-wide output blocks across all groups
+        dispatcher.w = top_blob_unpacked.w;
+        dispatcher.h = top_blob_unpacked.h;
+        dispatcher.c = group * num_output_g_pack4;
+
+        cmd.record_pipeline(pipeline_convolutiondepthwise_group, bindings, constants_group, dispatcher);
+
+        if (out_elempack_g != out_elempack) // repack the workspace result into top_blob's out_elempack
+        {
+            vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt);
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h
index da22c82097a0..cf110296ef0a 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h
@@ -23,6 +23,13 @@ class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise
     using ConvolutionDepthWise::forward;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8(const Option& opt); // presumably builds the int8 tables and pipelines used by the two methods below — confirm in the .cpp
+    int upload_model_int8(VkTransfer& cmd, const Option& opt); // records uploads of the int8 weight, descale, scale and bias tables
+    int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; // int8 forward path; quantizes the input itself when it is not 8-bit
+#endif
+
 public:
     Mat weight_data_packed;
     Mat weight_data_packed_groups;
@@ -34,11 +41,22 @@ class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise
 
     Pipeline* pipeline_convolutiondepthwise;
     Pipeline* pipeline_convolutiondepthwise_pack4;
-
     Pipeline* pipeline_convolutiondepthwise_group;
     Pipeline* pipeline_convolutiondepthwise_group_pack4;
     Pipeline* pipeline_convolutiondepthwise_group_pack1to4;
     Pipeline* pipeline_convolutiondepthwise_group_pack4to1;
+
+#if NCNN_INT8
+    ncnn::Layer* quantize; // sub-layer that forward_int8 runs on inputs that are not already 8-bit
+    // Mat-side int8 tables; upload_model_int8 releases each one right after recording its upload
+    Mat weight_data_int8_packed;
+    Mat weight_data_int8_descales;
+    Mat top_blob_int8_scales_packed;
+    Mat bias_data_int8_packed;
+    // VkMat destinations of the descale and requantize-scale uploads
+    VkMat weight_data_int8_descales_gpu;
+    VkMat top_blob_int8_scales_gpu;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/flatten_vulkan.cpp b/src/layer/vulkan/flatten_vulkan.cpp
index 40483bd670b2..50ec77e7cccb 100644
--- a/src/layer/vulkan/flatten_vulkan.cpp
+++ b/src/layer/vulkan/flatten_vulkan.cpp
@@ -15,6 +15,11 @@ Flatten_vulkan::Flatten_vulkan()
     pipeline_flatten = 0;
     pipeline_flatten_pack4 = 0;
     pipeline_flatten_pack1to4 = 0;
+#if NCNN_INT8
+    pipeline_flatten_int8 = 0;
+    pipeline_flatten_pack4_int8 = 0;
+    pipeline_flatten_pack1to4_int8 = 0;
+#endif
 }
 
 int Flatten_vulkan::create_pipeline(const Option& opt)
@@ -42,6 +47,34 @@ int Flatten_vulkan::create_pipeline(const Option& opt)
         local_size_xyz.c = 1;
     }
 
+#if NCNN_INT8
+    Mat shape_int8;
+    if (shape.dims == 1) shape_int8 = Mat(shape.w, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 2) shape_int8 = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 3) shape_int8 = Mat(shape.w, shape.h, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 4) shape_int8 = Mat(shape.w, shape.h, shape.d, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack);
+
+    Mat out_shape_int8;
+    if (out_shape.dims == 1) out_shape_int8 = Mat(out_shape.w, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 2) out_shape_int8 = Mat(out_shape.w, out_shape.h, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 3) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 4) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+
+    std::vector<vk_specialization_type> specializations_int8 = specializations;
+    specializations_int8[0 + 0].i = std::min(3, shape_int8.dims);
+    specializations_int8[0 + 1].i = shape_int8.w;
+    specializations_int8[0 + 2].i = shape_int8.h * shape_int8.d;
+    specializations_int8[0 + 3].i = shape_int8.c;
+    specializations_int8[0 + 4].i = shape_int8.cstep;
+    specializations_int8[0 + 5].i = std::min(3, out_shape_int8.dims);
+    specializations_int8[0 + 6].i = out_shape_int8.w;
+    specializations_int8[0 + 7].i = out_shape_int8.h * out_shape_int8.d;
+    specializations_int8[0 + 8].i = out_shape_int8.c;
+    specializations_int8[0 + 9].i = out_shape_int8.cstep;
+
+    const bool use_int8_pipeline = opt.use_int8_packed || opt.use_int8_storage;
+#endif
+
     // pack1
     if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 1))
     {
@@ -66,6 +99,32 @@ int Flatten_vulkan::create_pipeline(const Option& opt)
         pipeline_flatten_pack1to4->create(LayerShaderType::flatten_pack1to4, opt, specializations);
     }
 
+#if NCNN_INT8
+    if (use_int8_pipeline)
+    {
+        if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 1))
+        {
+            pipeline_flatten_int8 = new Pipeline(vkdev);
+            pipeline_flatten_int8->set_optimal_local_size_xyz(local_size_xyz);
+            pipeline_flatten_int8->create(LayerShaderType::flatten_int8, opt, specializations_int8);
+        }
+
+        if (shape.dims == 0 || (shape.elempack == 4 && out_shape.elempack == 4))
+        {
+            pipeline_flatten_pack4_int8 = new Pipeline(vkdev);
+            pipeline_flatten_pack4_int8->set_optimal_local_size_xyz(local_size_xyz);
+            pipeline_flatten_pack4_int8->create(LayerShaderType::flatten_pack4_int8, opt, specializations_int8);
+        }
+
+        if (shape.dims == 0 || (shape.elempack == 1 && out_shape.elempack == 4))
+        {
+            pipeline_flatten_pack1to4_int8 = new Pipeline(vkdev);
+            pipeline_flatten_pack1to4_int8->set_optimal_local_size_xyz(local_size_xyz);
+            pipeline_flatten_pack1to4_int8->create(LayerShaderType::flatten_pack1to4_int8, opt, specializations_int8);
+        }
+    }
+#endif
+
     return 0;
 }
 
@@ -80,6 +139,17 @@ int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_flatten_pack1to4;
     pipeline_flatten_pack1to4 = 0;
 
+#if NCNN_INT8
+    delete pipeline_flatten_int8;
+    pipeline_flatten_int8 = 0;
+
+    delete pipeline_flatten_pack4_int8;
+    pipeline_flatten_pack4_int8 = 0;
+
+    delete pipeline_flatten_pack1to4_int8;
+    pipeline_flatten_pack1to4_int8 = 0;
+#endif
+
     return 0;
 }
 
@@ -138,7 +208,28 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
     constants[9].i = top_blob.cstep;
 
     const Pipeline* pipeline = 0;
+#if NCNN_INT8
+    if (bottom_blob.elembits() == 8 && elempack == 1 && out_elempack == 1)
+    {
+        pipeline = pipeline_flatten_int8;
+    }
+    else if (bottom_blob.elembits() == 8 && elempack == 4 && out_elempack == 4)
+    {
+        pipeline = pipeline_flatten_pack4_int8;
+    }
+    else if (bottom_blob.elembits() == 8 && elempack == 1 && out_elempack == 4)
+    {
+        pipeline = pipeline_flatten_pack1to4_int8;
+    }
+    else if (elempack == 1 && out_elempack == 1)
+#else
+    if (bottom_blob.elembits() == 8)
+    {
+        return -1;
+    }
+
     if (elempack == 1 && out_elempack == 1)
+#endif
     {
         pipeline = pipeline_flatten;
     }
@@ -151,6 +242,9 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
         pipeline = pipeline_flatten_pack1to4;
     }
 
+    if (!pipeline)
+        return -1;
+
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
     return 0;
diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h
index e3425385ba99..880a3a31a41e 100644
--- a/src/layer/vulkan/flatten_vulkan.h
+++ b/src/layer/vulkan/flatten_vulkan.h
@@ -23,6 +23,11 @@ class Flatten_vulkan : public Flatten
     Pipeline* pipeline_flatten;
     Pipeline* pipeline_flatten_pack4;
     Pipeline* pipeline_flatten_pack1to4;
+#if NCNN_INT8
+    Pipeline* pipeline_flatten_int8; // int8 pipelines: created only when opt.use_int8_packed or opt.use_int8_storage is set, used by forward for 8-bit inputs
+    Pipeline* pipeline_flatten_pack4_int8;
+    Pipeline* pipeline_flatten_pack1to4_int8;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp
index 682729c95550..dfc0bb250270 100644
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -7,6 +7,16 @@
 
 namespace ncnn {
 
+#if NCNN_INT8
+static inline signed char float2int8(float v) // rounds v to the nearest integer and saturates the result to [-127, 127]
+{
+    int int32 = static_cast<int>(round(v)); // NOTE(review): this cast is undefined for NaN or values outside int range — confirm callers cannot produce them
+    if (int32 > 127) return 127;
+    if (int32 < -127) return -127; // -128 is never produced, so the range stays symmetric
+    return (signed char)int32;
+}
+#endif // NCNN_INT8
+
 Gemm_vulkan::Gemm_vulkan()
 {
     support_vulkan = true;
@@ -14,6 +24,12 @@ Gemm_vulkan::Gemm_vulkan()
     support_vulkan_any_packing = true;
 
     pipeline_gemm = 0;
+#if NCNN_INT8
+    pipeline_gemm_quantize_A_int8 = 0;
+    pipeline_gemm_quantize_B_absmax_int8 = 0;
+    pipeline_gemm_quantize_B_descale_int8 = 0;
+    pipeline_gemm_quantize_B_int8 = 0;
+#endif
 
     use_subgroup_ops = false;
 
@@ -29,20 +45,15 @@ Gemm_vulkan::Gemm_vulkan()
     UNROLL_WG_N = 1;
 }
 
-int Gemm_vulkan::load_param(const ParamDict& pd)
+int Gemm_vulkan::create_pipeline(const Option& opt)
 {
-    int ret = Gemm::load_param(pd);
-
+#if NCNN_INT8
     if (int8_scale_term)
     {
-        support_vulkan = false;
+        return create_pipeline_int8(opt);
     }
+#endif
 
-    return ret;
-}
-
-int Gemm_vulkan::create_pipeline(const Option& opt)
-{
     // const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
     if (constantA)
@@ -599,6 +610,20 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_gemm;
     pipeline_gemm = 0;
 
+#if NCNN_INT8
+    delete pipeline_gemm_quantize_A_int8;
+    pipeline_gemm_quantize_A_int8 = 0;
+
+    delete pipeline_gemm_quantize_B_absmax_int8;
+    pipeline_gemm_quantize_B_absmax_int8 = 0;
+
+    delete pipeline_gemm_quantize_B_descale_int8;
+    pipeline_gemm_quantize_B_descale_int8 = 0;
+
+    delete pipeline_gemm_quantize_B_int8;
+    pipeline_gemm_quantize_B_int8 = 0;
+#endif
+
     use_subgroup_ops = false;
 
     use_cooperative_matrix = false;
@@ -617,6 +642,13 @@ int Gemm_vulkan::destroy_pipeline(const Option& /*opt*/)
 
 int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return upload_model_int8(cmd, opt);
+    }
+#endif
+
     if (constantA)
     {
         cmd.record_upload(A_data_packed, A_data_gpu, opt);
@@ -643,6 +675,13 @@ int Gemm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 
 int Gemm_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blobs, top_blobs, cmd, opt);
+    }
+#endif
+
     const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
     const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1];
 
@@ -865,4 +904,749 @@ int Gemm_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
     return ret;
 }
 
+#if NCNN_INT8
+int Gemm_vulkan::create_pipeline_int8(const Option& opt)
+{ // Prepares the int8 Gemm path: packs or quantizes constant A/B on the CPU, then creates the quantize and gemm pipelines. Returns 0, or -100 when a Mat allocation fails.
+    Option opt_int8 = opt; // options used for every shader created here: opt with the three flags below switched off
+    opt_int8.use_fp16_arithmetic = false;
+    opt_int8.use_int16_packed = false;
+    opt_int8.use_int16_storage = false;
+
+    coopmat_M = 0; // clear the tile sizes before querying the device
+    coopmat_N = 0;
+    coopmat_K = 0;
+    coopmat_subgroup_size = 0;
+
+    use_cooperative_matrix = vkdev->info.support_int8_cooperative_matrix() && opt.use_cooperative_matrix && opt.use_int8_arithmetic;
+    if (use_cooperative_matrix)
+    {
+        int M = constantM ? constantM : 1024; // 1024 stands in for a dimension that is not constant
+        int N = constantN ? constantN : 1024;
+        int K = constantK ? constantK : 1024;
+
+        vkdev->info.get_optimal_cooperative_matrix_mnk(M, N, K, VK_COMPONENT_TYPE_SINT8_KHR, VK_COMPONENT_TYPE_SINT32_KHR, VK_SCOPE_SUBGROUP_KHR, coopmat_M, coopmat_N, coopmat_K, coopmat_subgroup_size);
+
+        if (coopmat_M == 0 || coopmat_N == 0 || coopmat_K == 0) // a zero tile size disables the cooperative-matrix path
+        {
+            use_cooperative_matrix = false;
+        }
+        else
+        {
+            UNROLL_SG_M = std::min((M + coopmat_M - 1) / coopmat_M, 2); // tiles needed along each axis (rounded up), capped at 2
+            UNROLL_SG_N = std::min((N + coopmat_N - 1) / coopmat_N, 2);
+            UNROLL_SG_K = std::min((K + coopmat_K - 1) / coopmat_K, 2);
+
+            UNROLL_WG_M = std::min((M + coopmat_M * UNROLL_SG_M - 1) / (coopmat_M * UNROLL_SG_M), 2); // same rounding on top of the SG unroll, capped at 2
+            UNROLL_WG_N = std::min((N + coopmat_N * UNROLL_SG_N - 1) / (coopmat_N * UNROLL_SG_N), 2);
+        }
+    }
+
+    if (constantA)
+    {
+        A_data_int8_packed.create(constantK, constantM, (size_t)1u, 1); // constantM rows of constantK one-byte values
+        if (A_data_int8_packed.empty())
+            return -100;
+
+        A_data_int8_descales.create(constantM, (size_t)4u, 1); // one 4-byte descale per row of A
+        if (A_data_int8_descales.empty())
+            return -100;
+
+        if (A_data.elemsize == (size_t)1u) // A_data already stores one-byte elements: copy them and invert the stored scales
+        {
+            for (int i = 0; i < constantM; i++)
+            {
+                const float scale = A_data_int8_scales[i];
+                A_data_int8_descales[i] = scale == 0.f ? 0.f : 1.f / scale; // descale is 1 / scale; a zero scale yields a zero descale
+            }
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < constantM; i++)
+            {
+                signed char* outptr = A_data_int8_packed.row<signed char>(i);
+
+                for (int k = 0; k < constantK; k++)
+                {
+                    outptr[k] = transA ? A_data.row<const signed char>(k)[i] : A_data.row<const signed char>(i)[k]; // with transA the source is indexed [k][i], so the output is always [i][k]
+                }
+            }
+        }
+        else // otherwise quantize each row of A with its own absmax
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < constantM; i++)
+            {
+                float absmax = 0.f;
+                for (int k = 0; k < constantK; k++)
+                {
+                    const float v = transA ? A_data.row(k)[i] : A_data.row(i)[k];
+                    absmax = std::max(absmax, v < 0.f ? -v : v);
+                }
+
+                const float A_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax; // maps absmax to 127; an all-zero row uses scale 1
+                A_data_int8_descales[i] = absmax == 0.f ? 0.f : absmax * (1.f / 127.f); // inverse of the scale, 0 for an all-zero row
+
+                signed char* outptr = A_data_int8_packed.row<signed char>(i);
+
+                for (int k = 0; k < constantK; k++)
+                {
+                    const float v = transA ? A_data.row(k)[i] : A_data.row(i)[k];
+                    outptr[k] = float2int8(v * A_int8_scale);
+                }
+            }
+        }
+
+        if (use_cooperative_matrix)
+        {
+            Mat A_data_int8 = A_data_int8_packed; // keep a handle to the row-major int8 data; A_data_int8_packed is re-created below
+
+            const int blocks_m = (constantM + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); // row blocks, rounded up
+            const int kk = (constantK + coopmat_K - 1) / coopmat_K; // coopmat_K-sized steps along K, rounded up
+
+            const int A_data_int8_packed_size = coopmat_M * coopmat_K * UNROLL_SG_M * UNROLL_WG_M * kk; // bytes written per row block
+            A_data_int8_packed.create(A_data_int8_packed_size / 4, blocks_m, (size_t)4u, 4); // one row per block; elemsize 4 with elempack 4, hence size / 4
+            if (A_data_int8_packed.empty())
+                return -100;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int bm = 0; bm < blocks_m; bm++)
+            {
+                signed char* p = A_data_int8_packed.row<signed char>(bm);
+
+                int k = 0;
+                for (; k + UNROLL_SG_K - 1 < kk; k += UNROLL_SG_K) // full groups of UNROLL_SG_K k-steps
+                {
+                    for (int wm = 0; wm < UNROLL_WG_M; wm++)
+                    {
+                        for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                        {
+                            for (int zm = 0; zm < UNROLL_SG_M; zm++)
+                            {
+                                for (int i = 0; i < coopmat_M; i++)
+                                {
+                                    for (int j = 0; j < coopmat_K; j++)
+                                    {
+                                        const int gmi = ((bm * UNROLL_WG_M + wm) * UNROLL_SG_M + zm) * coopmat_M + i; // row index into A_data_int8
+                                        const int gki = (k + zk) * coopmat_K + j; // k index into A_data_int8
+
+                                        if (gmi < constantM && gki < constantK)
+                                        {
+                                            *p++ = A_data_int8.row<const signed char>(gmi)[gki];
+                                        }
+                                        else // positions outside the matrix are written as zero
+                                        {
+                                            *p++ = 0;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                for (; k < kk; k++) // leftover k-steps, one at a time (no zk loop)
+                {
+                    for (int wm = 0; wm < UNROLL_WG_M; wm++)
+                    {
+                        for (int zm = 0; zm < UNROLL_SG_M; zm++)
+                        {
+                            for (int i = 0; i < coopmat_M; i++)
+                            {
+                                for (int j = 0; j < coopmat_K; j++)
+                                {
+                                    const int gmi = ((bm * UNROLL_WG_M + wm) * UNROLL_SG_M + zm) * coopmat_M + i;
+                                    const int gki = k * coopmat_K + j;
+
+                                    if (gmi < constantM && gki < constantK)
+                                    {
+                                        *p++ = A_data_int8.row<const signed char>(gmi)[gki];
+                                    }
+                                    else
+                                    {
+                                        *p++ = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (constantB)
+    {
+        B_data_int8_packed.create(constantK, constantN, (size_t)1u, 1); // constantN rows of constantK one-byte values
+        if (B_data_int8_packed.empty())
+            return -100;
+
+        B_data_int8_descales.create(1); // a single descale for the whole of B
+        if (B_data_int8_descales.empty())
+            return -100;
+
+        if (B_data.elemsize == (size_t)1u) // B_data already stores one-byte elements: copy them and invert the stored scale
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int j = 0; j < constantN; j++)
+            {
+                signed char* outptr = B_data_int8_packed.row<signed char>(j);
+
+                for (int k = 0; k < constantK; k++)
+                {
+                    outptr[k] = transB ? B_data.row<const signed char>(j)[k] : B_data.row<const signed char>(k)[j]; // without transB the source is indexed [k][j], so the output is always [j][k]
+                }
+            }
+
+            B_data_int8_descales[0] = B_data_int8_scale == 0.f ? 0.f : 1.f / B_data_int8_scale;
+        }
+        else // otherwise quantize B with one absmax taken over the whole matrix
+        {
+            float absmax = 0.f;
+            for (int j = 0; j < constantN; j++)
+            {
+                for (int k = 0; k < constantK; k++)
+                {
+                    const float v = transB ? B_data.row(j)[k] : B_data.row(k)[j];
+                    absmax = std::max(absmax, v < 0.f ? -v : v);
+                }
+            }
+
+            const float B_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax; // maps absmax to 127; an all-zero B uses scale 1
+            B_data_int8_descales[0] = absmax == 0.f ? 0.f : absmax * (1.f / 127.f);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int j = 0; j < constantN; j++)
+            {
+                signed char* outptr = B_data_int8_packed.row<signed char>(j);
+
+                for (int k = 0; k < constantK; k++)
+                {
+                    const float v = transB ? B_data.row(j)[k] : B_data.row(k)[j];
+                    outptr[k] = float2int8(v * B_int8_scale);
+                }
+            }
+        }
+
+        if (use_cooperative_matrix)
+        {
+            Mat B_data_int8 = B_data_int8_packed; // keep a handle to the row-major int8 data; B_data_int8_packed is re-created below
+
+            const int blocks_n = (constantN + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); // row blocks, rounded up
+            const int kk = (constantK + coopmat_K - 1) / coopmat_K; // coopmat_K-sized steps along K, rounded up
+
+            const int B_data_int8_packed_size = coopmat_N * coopmat_K * UNROLL_SG_N * UNROLL_WG_N * kk; // bytes written per row block
+            B_data_int8_packed.create(B_data_int8_packed_size / 4, blocks_n, (size_t)4u, 4); // one row per block; elemsize 4 with elempack 4, hence size / 4
+            if (B_data_int8_packed.empty())
+                return -100;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int bn = 0; bn < blocks_n; bn++)
+            {
+                signed char* p = B_data_int8_packed.row<signed char>(bn);
+
+                int k = 0;
+                for (; k + UNROLL_SG_K - 1 < kk; k += UNROLL_SG_K) // full groups of UNROLL_SG_K k-steps
+                {
+                    for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                    {
+                        for (int zk = 0; zk < UNROLL_SG_K; zk++)
+                        {
+                            for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                            {
+                                for (int i = 0; i < coopmat_K; i++) // unlike the A packing, the outer tile index here runs over K
+                                {
+                                    for (int j = 0; j < coopmat_N; j++)
+                                    {
+                                        const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j; // row index into B_data_int8
+                                        const int gki = (k + zk) * coopmat_K + i; // k index into B_data_int8
+
+                                        if (gni < constantN && gki < constantK)
+                                        {
+                                            *p++ = B_data_int8.row<const signed char>(gni)[gki];
+                                        }
+                                        else // positions outside the matrix are written as zero
+                                        {
+                                            *p++ = 0;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                for (; k < kk; k++) // leftover k-steps, one at a time (no zk loop)
+                {
+                    for (int wn = 0; wn < UNROLL_WG_N; wn++)
+                    {
+                        for (int zn = 0; zn < UNROLL_SG_N; zn++)
+                        {
+                            for (int i = 0; i < coopmat_K; i++)
+                            {
+                                for (int j = 0; j < coopmat_N; j++)
+                                {
+                                    const int gni = ((bn * UNROLL_WG_N + wn) * UNROLL_SG_N + zn) * coopmat_N + j;
+                                    const int gki = k * coopmat_K + i;
+
+                                    if (gni < constantN && gki < constantK)
+                                    {
+                                        *p++ = B_data_int8.row<const signed char>(gni)[gki];
+                                    }
+                                    else
+                                    {
+                                        *p++ = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (constantC && constant_broadcast_type_C != -1)
+    {
+        C_data_packed = C_data; // C is taken as-is, no repacking or quantization
+    }
+
+    if (!constantA) // A is a runtime input: create the pipeline that quantizes it
+    {
+        std::vector<vk_specialization_type> specializations(1);
+        specializations[0].i = transA;
+
+        pipeline_gemm_quantize_A_int8 = new Pipeline(vkdev);
+        pipeline_gemm_quantize_A_int8->set_optimal_local_size_xyz(Mat(64, 1, 1, (void*)0));
+        pipeline_gemm_quantize_A_int8->create(LayerShaderType::gemm_quantize_A_int8, opt_int8, specializations);
+    }
+
+    if (!constantB) // B is a runtime input: create the absmax, descale and quantize pipelines
+    {
+        std::vector<vk_specialization_type> specializations(1);
+        specializations[0].i = transB;
+
+        pipeline_gemm_quantize_B_absmax_int8 = new Pipeline(vkdev);
+        pipeline_gemm_quantize_B_absmax_int8->set_local_size_xyz(128, 1, 1);
+        pipeline_gemm_quantize_B_absmax_int8->create(LayerShaderType::gemm_quantize_B_absmax_int8, opt_int8, specializations);
+
+        pipeline_gemm_quantize_B_descale_int8 = new Pipeline(vkdev);
+        pipeline_gemm_quantize_B_descale_int8->set_local_size_xyz(128, 1, 1);
+        pipeline_gemm_quantize_B_descale_int8->create(LayerShaderType::gemm_quantize_B_descale_int8, opt_int8, std::vector<vk_specialization_type>()); // this shader takes no specialization constants
+
+        pipeline_gemm_quantize_B_int8 = new Pipeline(vkdev);
+        pipeline_gemm_quantize_B_int8->set_optimal_local_size_xyz(Mat(64, 1, 1, (void*)0));
+        pipeline_gemm_quantize_B_int8->create(LayerShaderType::gemm_quantize_B_int8, opt_int8, specializations);
+    }
+
+    if (use_cooperative_matrix)
+    {
+        int outh = output_transpose ? constantN : constantM;
+        int out_elempack = outh ? (outh % 4 == 0 ? 4 : 1) : 0; // 0 when the output height is not constant
+        if (output_elempack) // an explicit output_elempack overrides the derived value
+            out_elempack = output_elempack;
+
+        std::vector<vk_specialization_type> specializations(11 + 9); // 11 gemm constants followed by 9 cooperative-matrix constants
+        specializations[0].f = alpha;
+        specializations[1].f = beta;
+        specializations[2].i = constantA;
+        specializations[3].i = constantB;
+        specializations[4].i = constantC;
+        specializations[5].i = constant_broadcast_type_C;
+        specializations[6].i = output_transpose;
+        specializations[7].u32 = constantM;
+        specializations[8].u32 = constantN;
+        specializations[9].u32 = constantK;
+        specializations[10].u32 = out_elempack;
+
+        specializations[11 + 0].u32 = coopmat_M;
+        specializations[11 + 1].u32 = coopmat_N;
+        specializations[11 + 2].u32 = coopmat_K;
+        specializations[11 + 3].u32 = coopmat_subgroup_size;
+        specializations[11 + 4].u32 = UNROLL_SG_M;
+        specializations[11 + 5].u32 = UNROLL_SG_N;
+        specializations[11 + 6].u32 = UNROLL_SG_K;
+        specializations[11 + 7].u32 = UNROLL_WG_M;
+        specializations[11 + 8].u32 = UNROLL_WG_N;
+
+        pipeline_gemm = new Pipeline(vkdev);
+        pipeline_gemm->set_subgroup_size(coopmat_subgroup_size);
+        pipeline_gemm->set_local_size_xyz(coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N, 1, 1); // local size x = subgroup size times the WG unroll factors
+        pipeline_gemm->create(LayerShaderType::gemm_int8_cm, opt_int8, specializations);
+    }
+    else // plain int8 shader with a fixed 8x8 local size
+    {
+        std::vector<vk_specialization_type> specializations(5);
+        specializations[0].f = alpha;
+        specializations[1].f = beta;
+        specializations[2].i = constantC;
+        specializations[3].i = constant_broadcast_type_C;
+        specializations[4].i = output_transpose;
+
+        pipeline_gemm = new Pipeline(vkdev);
+        pipeline_gemm->set_local_size_xyz(8, 8, 1);
+        pipeline_gemm->create(LayerShaderType::gemm_int8, opt_int8, specializations);
+    }
+
+    if (opt.lightmode) // lightmode: release the source A/B/C data now that packing is done
+    {
+        A_data.release();
+        B_data.release();
+        C_data.release();
+    }
+
+    return 0;
+}
+
+int Gemm_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt)
+{ // Records uploads of the packed int8 A/B data, their descales and the constant C, releasing each host copy afterwards. Always returns 0.
+    Option opt_fp32 = opt; // copy of opt with fp16/bf16 packed and storage disabled; used only for the descale uploads
+    opt_fp32.use_fp16_packed = false;
+    opt_fp32.use_fp16_storage = false;
+    opt_fp32.use_bf16_packed = false;
+    opt_fp32.use_bf16_storage = false;
+
+    if (constantA)
+    {
+        cmd.record_upload(A_data_int8_packed, A_data_gpu, opt); // the int8 weights go into the same A_data_gpu member the non-int8 path uses
+
+        A_data_int8_packed.release();
+
+        cmd.record_upload(A_data_int8_descales, A_data_int8_descales_gpu, opt_fp32);
+
+        A_data_int8_descales.release();
+        A_data_int8_scales.release(); // the scales are not uploaded here, only their host copy is freed
+    }
+
+    if (constantB)
+    {
+        cmd.record_upload(B_data_int8_packed, B_data_gpu, opt);
+
+        B_data_int8_packed.release();
+
+        cmd.record_upload(B_data_int8_descales, B_data_int8_descales_gpu, opt_fp32);
+
+        B_data_int8_descales.release();
+    }
+
+    if (constantC && constant_broadcast_type_C != -1) // same condition under which create_pipeline_int8 fills C_data_packed
+    {
+        cmd.record_upload(C_data_packed, C_data_gpu, opt); // C uses the caller's opt, not opt_fp32
+
+        C_data_packed.release();
+    }
+
+    return 0;
+}
+
+int Gemm_vulkan::forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{ // Records the int8 gemm: quantizes runtime A/B on the GPU when they are not constant, then dispatches pipeline_gemm into top_blobs[0]. Returns 0, -1 for 8-bit runtime inputs, -100 on allocation failure.
+    const VkMat& A0 = constantA ? A_data_gpu : bottom_blobs[0];
+    const VkMat& B0 = constantB ? B_data_gpu : constantA ? bottom_blobs[0] : bottom_blobs[1]; // B is the first bottom blob when A is constant, otherwise the second
+
+    VkMat A = A0;
+    VkMat B = B0;
+
+    // Runtime int8 blobs do not carry scale metadata, so reject before recording int8 pipelines.
+    if (!constantA && A.elembits() == 8)
+    {
+        NCNN_LOGE("Gemm_vulkan int8 dynamic int8 A is not supported without input scale");
+        return -1;
+    }
+
+    if (!constantB && B.elembits() == 8)
+    {
+        NCNN_LOGE("Gemm_vulkan int8 dynamic int8 B is not supported without input scale");
+        return -1;
+    }
+
+    if (!constantA && A.elempack != 1) // runtime A is converted to elempack 1 using the workspace allocator
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+        VkMat A_unpacked;
+        vkdev->convert_packing(A, A_unpacked, 1, cmd, opt_pack1);
+        A = A_unpacked;
+    }
+
+    if (!constantB && B.elempack != 1) // same conversion for runtime B
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+        VkMat B_unpacked;
+        vkdev->convert_packing(B, B_unpacked, 1, cmd, opt_pack1);
+        B = B_unpacked;
+    }
+
+    const int M = constantM ? constantM : transA ? A.w : (A.dims == 3 ? A.c : A.h); // a non-zero constant wins, otherwise the size is read from the blob shape
+    const int K = constantK ? constantK : transA ? (A.dims == 3 ? A.c : A.h) : A.w;
+    const int N = constantN ? constantN : transB ? (B.dims == 3 ? B.c : B.h) : B.w;
+
+    VkMat C;
+    int broadcast_type_C = -1; // stays -1 when there is no C or its shape matches none of the cases below
+    if (constantC && constant_broadcast_type_C != -1)
+    {
+        C = C_data_gpu;
+        broadcast_type_C = constant_broadcast_type_C;
+    }
+    else
+    {
+        VkMat C0; // runtime C is the bottom blob after the non-constant A/B inputs, if that blob exists
+        if (constantA && constantB)
+        {
+            C0 = bottom_blobs.size() == 1 ? bottom_blobs[0] : VkMat();
+        }
+        else if (constantA)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else if (constantB)
+        {
+            C0 = bottom_blobs.size() == 2 ? bottom_blobs[1] : VkMat();
+        }
+        else
+        {
+            C0 = bottom_blobs.size() == 3 ? bottom_blobs[2] : VkMat();
+        }
+
+        if (!C0.empty())
+        {
+            C = C0;
+            if (C.elempack != 1)
+            {
+                Option opt_pack1 = opt;
+                opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+                VkMat C_unpacked;
+                vkdev->convert_packing(C, C_unpacked, 1, cmd, opt_pack1);
+                C = C_unpacked;
+            }
+
+            if (C.dims == 1 && C.w == 1) // type 0: single value
+            {
+                broadcast_type_C = 0;
+            }
+            if (C.dims == 1 && C.w == M) // type 1: 1-D of length M
+            {
+                broadcast_type_C = 1;
+            }
+            if (C.dims == 1 && C.w == N) // type 4: 1-D of length N; checked after M, so it wins when M == N
+            {
+                broadcast_type_C = 4;
+            }
+            if (C.dims == 2 && C.w == 1 && C.h == M) // type 2: M x 1
+            {
+                broadcast_type_C = 2;
+            }
+            if (C.dims == 2 && C.w == N && C.h == M) // type 3: full M x N
+            {
+                broadcast_type_C = 3;
+            }
+            if (C.dims == 2 && C.w == N && C.h == 1) // type 4: 1 x N
+            {
+                broadcast_type_C = 4;
+            }
+        }
+    }
+
+    if (!C.empty() && C.elempack != 1) // applies to any C; a runtime C unpacked above already has elempack 1
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+        VkMat C_unpacked;
+        vkdev->convert_packing(C, C_unpacked, 1, cmd, opt_pack1);
+        C = C_unpacked;
+    }
+
+    int out_elempack = 1; // only the cooperative-matrix path changes this
+    if (use_cooperative_matrix)
+    {
+        int outh = output_transpose ? N : M;
+        out_elempack = outh % 4 == 0 ? 4 : 1;
+        if (output_elempack) // an explicit output_elempack overrides the derived value
+            out_elempack = output_elempack;
+    }
+
+    size_t elemsize; // bytes per output scalar: 2 when any fp16/bf16 storage or packed option is on, else 4
+    if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+    {
+        elemsize = 2u;
+    }
+    else
+    {
+        elemsize = 4u;
+    }
+    size_t out_elemsize = elemsize * out_elempack;
+
+    VkMat A_int8 = A; // for constant A these are the uploaded int8 data and descales; replaced below otherwise
+    VkMat A_int8_descales = A_data_int8_descales_gpu;
+    if (!constantA)
+    {
+        A_int8.create(K, M, (size_t)1u, 1, opt.workspace_vkallocator); // M rows of K one-byte values
+        if (A_int8.empty())
+            return -100;
+
+        A_int8_descales.create(M, (size_t)4u, 1, opt.workspace_vkallocator); // one 4-byte descale per row
+        if (A_int8_descales.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(3);
+        bindings[0] = A;
+        bindings[1] = A_int8;
+        bindings[2] = A_int8_descales;
+
+        std::vector<vk_constant_type> constants(4);
+        constants[0].i = M;
+        constants[1].i = K;
+        constants[2].i = A.dims;
+        constants[3].i = A.dims == 3 ? A.cstep : A.dims == 2 ? A.w : transA ? M : K;
+
+        VkMat dispatcher;
+        dispatcher.w = M; // dispatch width equals the row count M
+        dispatcher.h = 1;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_gemm_quantize_A_int8, bindings, constants, dispatcher);
+    }
+
+    VkMat B_int8 = B; // for constant B these are the uploaded int8 data and descale; replaced below otherwise
+    VkMat B_int8_descale = B_data_int8_descales_gpu;
+    if (!constantB)
+    {
+        const int size = N * K;
+        const int blocks = (size + 1023) / 1024; // number of 1024-element chunks, rounded up
+
+        B_int8.create(K, N, (size_t)1u, 1, opt.workspace_vkallocator); // N rows of K one-byte values
+        if (B_int8.empty())
+            return -100;
+
+        B_int8_descale.create(1, (size_t)4u, 1, opt.workspace_vkallocator); // a single descale for the whole of B
+        if (B_int8_descale.empty())
+            return -100;
+
+        VkMat B_absmax;
+        B_absmax.create(blocks, (size_t)4u, 1, opt.workspace_vkallocator); // one 4-byte slot per chunk
+        if (B_absmax.empty())
+            return -100;
+
+        { // pass 1: B -> B_absmax
+            std::vector<VkMat> bindings(2);
+            bindings[0] = B;
+            bindings[1] = B_absmax;
+
+            std::vector<vk_constant_type> constants(5);
+            constants[0].i = N;
+            constants[1].i = K;
+            constants[2].i = B.dims;
+            constants[3].i = B.dims == 3 ? B.cstep : B.dims == 2 ? B.w : transB ? K : N;
+            constants[4].i = size;
+
+            VkMat dispatcher;
+            dispatcher.w = blocks * 128; // 128 per chunk
+            dispatcher.h = 1;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_gemm_quantize_B_absmax_int8, bindings, constants, dispatcher);
+        }
+
+        { // pass 2: B_absmax -> B_int8_descale
+            std::vector<VkMat> bindings(2);
+            bindings[0] = B_absmax;
+            bindings[1] = B_int8_descale;
+
+            std::vector<vk_constant_type> constants(1);
+            constants[0].i = blocks;
+
+            VkMat dispatcher;
+            dispatcher.w = 1;
+            dispatcher.h = 1;
+            dispatcher.c = 1;
+
+            cmd.record_pipeline(pipeline_gemm_quantize_B_descale_int8, bindings, constants, dispatcher);
+        }
+
+        std::vector<VkMat> bindings(4); // pass 3: B and B_int8_descale -> B_int8
+        bindings[0] = B;
+        bindings[1] = B_int8;
+        bindings[2] = B_int8_descale;
+        bindings[3] = B_int8; // same buffer as binding 1, bound a second time
+
+        std::vector<vk_constant_type> constants(5);
+        constants[0].i = N;
+        constants[1].i = K;
+        constants[2].i = B.dims;
+        constants[3].i = B.dims == 3 ? B.cstep : B.dims == 2 ? B.w : transB ? K : N;
+        constants[4].i = size;
+
+        VkMat dispatcher;
+        dispatcher.w = (size + 3) / 4; // a quarter of the element count, rounded up
+        dispatcher.h = 1;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_gemm_quantize_B_int8, bindings, constants, dispatcher);
+    }
+
+    VkMat& top_blob = top_blobs[0];
+    if (output_transpose) // transposed output: width M, height (or channels with output_N1M) N / out_elempack
+    {
+        if (output_N1M)
+            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        else
+            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else
+    {
+        if (output_N1M)
+            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        else
+            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<VkMat> bindings(use_cooperative_matrix ? 7 : 6); // the cooperative-matrix shader takes one extra binding
+    bindings[0] = top_blob;
+    bindings[1] = A_int8;
+    bindings[2] = B_int8;
+    bindings[3] = C;
+    bindings[4] = A_int8_descales;
+    bindings[5] = B_int8_descale;
+    if (use_cooperative_matrix)
+    {
+        bindings[6] = top_blob; // the output is bound a second time as binding 6
+    }
+
+    std::vector<vk_constant_type> constants(use_cooperative_matrix ? 6 : 5);
+    constants[0].u32 = M;
+    constants[1].u32 = N;
+    constants[2].u32 = K;
+    constants[3].i = broadcast_type_C;
+    constants[4].u32 = top_blob.dims == 3 ? top_blob.cstep : top_blob.w; // cstep for a 3-D output, w otherwise
+    if (use_cooperative_matrix)
+    {
+        constants[5].u32 = out_elempack;
+    }
+
+    VkMat dispatcher;
+    if (use_cooperative_matrix)
+    {
+        const int blocks_x = (M + coopmat_M * UNROLL_SG_M * UNROLL_WG_M - 1) / (coopmat_M * UNROLL_SG_M * UNROLL_WG_M); // blocks along M, rounded up
+        const int blocks_y = (N + coopmat_N * UNROLL_SG_N * UNROLL_WG_N - 1) / (coopmat_N * UNROLL_SG_N * UNROLL_WG_N); // blocks along N, rounded up
+
+        dispatcher.w = (blocks_x * blocks_y) * (coopmat_subgroup_size * UNROLL_WG_M * UNROLL_WG_N); // block count times the local size chosen at pipeline creation
+        dispatcher.h = 1;
+        dispatcher.c = 1;
+    }
+    else
+    {
+        dispatcher.w = (N + 3) / 4; // N and M each divided by 4, rounded up
+        dispatcher.h = (M + 3) / 4;
+        dispatcher.c = 1;
+    }
+
+    cmd.record_pipeline(pipeline_gemm, bindings, constants, dispatcher);
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h
index 9b275ca240e1..c483f543ab98 100644
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -13,8 +13,6 @@ class Gemm_vulkan : public Gemm
 public:
     Gemm_vulkan();
 
-    virtual int load_param(const ParamDict& pd);
-
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
 
@@ -24,6 +22,13 @@ class Gemm_vulkan : public Gemm
     virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8(const Option& opt);
+    int upload_model_int8(VkTransfer& cmd, const Option& opt);
+    int forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+#endif
+
 public:
     Mat A_data_packed;
     Mat B_data_packed;
@@ -49,6 +54,21 @@ class Gemm_vulkan : public Gemm
     int UNROLL_SG_K;
     int UNROLL_WG_M;
     int UNROLL_WG_N;
+
+#if NCNN_INT8
+    Mat A_data_int8_packed;
+    Mat B_data_int8_packed;
+    Mat A_data_int8_descales;
+    Mat B_data_int8_descales;
+
+    VkMat A_data_int8_descales_gpu;
+    VkMat B_data_int8_descales_gpu;
+
+    Pipeline* pipeline_gemm_quantize_A_int8;
+    Pipeline* pipeline_gemm_quantize_B_absmax_int8;
+    Pipeline* pipeline_gemm_quantize_B_descale_int8;
+    Pipeline* pipeline_gemm_quantize_B_int8;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp
index f146ea2c5e1f..13a3f04496ff 100644
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -5,6 +5,7 @@
 
 #include "layer_shader_type.h"
 #include "layer_type.h"
+#include "modelbin.h"
 
 namespace ncnn {
 
@@ -16,15 +17,29 @@ InnerProduct_vulkan::InnerProduct_vulkan()
     flatten = 0;
 
     pipeline_innerproduct = 0;
-
     pipeline_innerproduct_sum8 = 0;
     pipeline_innerproduct_reduce_sum8 = 0;
-
     pipeline_innerproduct_gemm = 0;
+
+#if NCNN_INT8
+    quantize = 0;
+
+    pipeline_innerproduct_int8 = 0;
+    pipeline_innerproduct_sum8_int8 = 0;
+    pipeline_innerproduct_reduce_sum8_int8 = 0;
+    pipeline_innerproduct_gemm_int8 = 0;
+#endif
 }
 
 int InnerProduct_vulkan::create_pipeline(const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term && weight_data.elemsize == (size_t)1u)
+    {
+        return create_pipeline_int8(opt);
+    }
+#endif
+
     const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
@@ -279,7 +294,6 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt)
         delete flatten;
         flatten = 0;
     }
-
     delete pipeline_innerproduct;
     pipeline_innerproduct = 0;
 
@@ -291,11 +305,38 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt)
     delete pipeline_innerproduct_gemm;
     pipeline_innerproduct_gemm = 0;
 
+#if NCNN_INT8
+    if (quantize)
+    {
+        quantize->destroy_pipeline(opt);
+        delete quantize;
+        quantize = 0;
+    }
+
+    delete pipeline_innerproduct_int8;
+    pipeline_innerproduct_int8 = 0;
+
+    delete pipeline_innerproduct_sum8_int8;
+    delete pipeline_innerproduct_reduce_sum8_int8;
+    pipeline_innerproduct_sum8_int8 = 0;
+    pipeline_innerproduct_reduce_sum8_int8 = 0;
+
+    delete pipeline_innerproduct_gemm_int8;
+    pipeline_innerproduct_gemm_int8 = 0;
+#endif
+
     return 0;
 }
 
 int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
+#if NCNN_INT8
+    if (int8_scale_term && weight_data_int8_packed.elembits() == 8)
+    {
+        return upload_model_int8(cmd, opt);
+    }
+#endif
+
     cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
 
     weight_data_packed.release();
@@ -312,6 +353,13 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 
 int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        return forward_int8(bottom_blob, top_blob, cmd, opt);
+    }
+#endif
+
     const int num_input = weight_data_size / num_output;
 
     int in_elempack = num_input % 4 == 0 ? 4 : 1;
@@ -463,4 +511,506 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
     return 0;
 }
 
+#if NCNN_INT8
+int InnerProduct_vulkan::create_pipeline_int8(const Option& opt)
+{ // int8 variant of create_pipeline(): packs weights, descales and bias into Mats, builds the Quantize helper layer and the int8 pipelines
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    const int num_input = weight_data_size / num_output;
+
+    const int num_input_packed = (num_input + 7) / 8 * 8; // num_input rounded up to a multiple of 8
+    const int num_output_packed = (num_output + 3) / 4 * 4; // num_output rounded up to a multiple of 4
+
+    { // repack weights: row q/4 holds consecutive 16-byte tiles of 4 outputs x 4 inputs (output-major), zero-padded
+        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
+
+        weight_data_int8_packed.create(num_input_packed / 4, num_output_packed / 4, (size_t)16u, 16);
+        weight_data_int8_packed.fill(0); // NOTE(review): the create() above is not checked for empty(), unlike the descale/bias Mats below
+
+        for (int q = 0; q < num_output_packed; q += 4)
+        {
+            signed char* g00 = weight_data_int8_packed.row<signed char>(q / 4);
+
+            for (int p = 0; p < num_input_packed; p += 4)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    const signed char* k0 = q + i < num_output && p < num_input ? weight_data_r2.row<const signed char>(q + i) + p : 0; // null when row q+i or column p lies in the padding
+
+                    for (int j = 0; j < 4; j++)
+                    {
+                        g00[0] = k0 && p + j < num_input ? k0[j] : 0; // write 0 for padded positions
+                        g00++;
+                    }
+                }
+            }
+        }
+    }
+
+    { // per-output descale = (1 / input scale) / weight scale; 0 when either scale is 0
+        const float bottom_blob_int8_scale = bottom_blob_int8_scales.empty() ? 1.f : bottom_blob_int8_scales[0]; // defaults to 1 when no input scale is present
+        const float bottom_blob_int8_descale = bottom_blob_int8_scale == 0.f ? 0.f : 1.f / bottom_blob_int8_scale;
+
+        weight_data_int8_descales.create(num_output_packed / 4, (size_t)4u * 4, 4);
+        if (weight_data_int8_descales.empty())
+            return -100;
+        weight_data_int8_descales.fill(0.f); // entries past num_output stay 0
+
+        float* outptr = weight_data_int8_descales;
+        for (int q = 0; q < num_output; q++)
+        {
+            float scale = weight_data_int8_scales[q];
+            outptr[q] = scale == 0.f ? 0.f : bottom_blob_int8_descale / scale;
+        }
+    }
+
+    if (bias_term) // copy bias into a float buffer padded to num_output_packed entries
+    {
+        bias_data_int8_packed.create(num_output_packed / 4, (size_t)4u * 4, 4);
+        if (bias_data_int8_packed.empty())
+            return -100;
+        bias_data_int8_packed.fill(0.f);
+
+        float* outptr = bias_data_int8_packed;
+        for (int q = 0; q < num_output; q++)
+        {
+            outptr[q] = bias_data[q];
+        }
+    }
+
+    Option opt_int8 = opt; // options passed to every int8 pipeline created below
+    opt_int8.use_fp16_arithmetic = false;
+    opt_int8.use_int16_packed = false;
+    opt_int8.use_int16_storage = false;
+
+    { // Quantize helper layer; forward_int8() runs it when the input is not already 8-bit
+        quantize = ncnn::create_layer_vulkan(ncnn::LayerType::Quantize);
+        quantize->vkdev = vkdev;
+
+        Mat shape_quantize;
+        Mat out_shape_quantize;
+        if (shape.dims == 2 && shape.w == num_input) // gemm case: keep the 2D shape hint
+        {
+            const size_t elemsize = shape.elempack == 0 ? (size_t)4u : shape.elemsize / shape.elempack;
+            shape_quantize = Mat(shape.w, shape.h, (void*)0, elemsize * shape.elempack, shape.elempack);
+            out_shape_quantize = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack); // 1 byte per scalar
+        }
+        else if (shape.dims != 0) // otherwise the hint is the flattened 1D shape
+        {
+            const int total = shape.w * shape.h * shape.d * shape.c * shape.elempack;
+            const int flatten_elempack = total % 4 == 0 ? 4 : 1;
+            const size_t elemsize = shape.elempack == 0 ? (size_t)4u : shape.elemsize / shape.elempack;
+            shape_quantize = Mat(total / flatten_elempack, (void*)0, elemsize * flatten_elempack, flatten_elempack);
+            out_shape_quantize = Mat(total / flatten_elempack, (void*)0, (size_t)flatten_elempack, flatten_elempack);
+        }
+
+        quantize->bottom_shapes.resize(1);
+        quantize->bottom_shapes[0] = shape_quantize;
+        quantize->top_shapes.resize(1);
+        quantize->top_shapes[0] = out_shape_quantize;
+
+        ncnn::ParamDict pd;
+        pd.set(0, 1); // presumably Quantize's scale_data_size = 1 (single input scale) — TODO confirm
+        quantize->load_param(pd);
+
+        Mat weights[1];
+        weights[0] = bottom_blob_int8_scales;
+        quantize->load_model(ModelBinFromMatArray(weights));
+
+        Option opt_quantize = opt;
+        opt_quantize.use_fp16_arithmetic = false;
+
+        quantize->create_pipeline(opt_quantize); // return value is not checked
+    }
+
+    if (shape.dims == 2 && shape.w == num_input)
+    {
+        // gemm: 2D input hint whose width equals num_input; only the gemm pipeline is built, then we return
+        Mat shape_unpacked(num_input, shape.h * shape.elempack, (void*)0);
+        Mat out_shape_unpacked(num_output, out_shape.dims == 0 ? 0 : out_shape.h * out_shape.elempack, (void*)0);
+
+        std::vector<vk_specialization_type> specializations(6 + 10);
+        specializations[0].i = bias_term;
+        specializations[1].i = activation_type;
+        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[4].i = shape.elempack;
+        specializations[5].i = num_input_packed / 4;
+        specializations[6 + 0].i = shape_unpacked.dims;
+        specializations[6 + 1].i = shape_unpacked.w;
+        specializations[6 + 2].i = shape_unpacked.h;
+        specializations[6 + 3].i = shape.elempack;
+        specializations[6 + 4].i = shape.w;
+        specializations[6 + 5].i = out_shape_unpacked.dims;
+        specializations[6 + 6].i = out_shape_unpacked.w;
+        specializations[6 + 7].i = out_shape_unpacked.h;
+        specializations[6 + 8].i = out_shape_unpacked.c;
+        specializations[6 + 9].i = out_shape.w;
+
+        Mat local_size_xyz(std::min(16, (num_output + 3) / 4), 4, 1, (void*)0);
+        if (out_shape_unpacked.dims != 0)
+        {
+            local_size_xyz.w = std::min(16, (out_shape_unpacked.w + 3) / 4);
+            local_size_xyz.h = std::min(4, out_shape_unpacked.h);
+            local_size_xyz.c = 1;
+        }
+
+        pipeline_innerproduct_gemm_int8 = new Pipeline(vkdev);
+        if (opt.use_shader_local_memory) // fixed 8x8x1 local size when shader local memory is enabled
+        {
+            pipeline_innerproduct_gemm_int8->set_local_size_xyz(8, 8, 1);
+        }
+        else
+        {
+            pipeline_innerproduct_gemm_int8->set_optimal_local_size_xyz(local_size_xyz);
+        }
+        pipeline_innerproduct_gemm_int8->create(LayerShaderType::innerproduct_gemm_int8, opt_int8, specializations);
+
+        if (opt.lightmode)
+        {
+            weight_data.release();
+        }
+
+        return 0; // flatten and the non-gemm pipelines are not created on this path
+    }
+
+    size_t elemsize; // bytes per scalar for the flattened input hint: 2 with any fp16/bf16 option, else 4
+    if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+    {
+        elemsize = 2u;
+    }
+    else
+    {
+        elemsize = 4u;
+    }
+
+    Mat shape_flatten;
+    if (shape.dims != 0)
+    {
+        const int total = shape.w * shape.h * shape.d * shape.c * shape.elempack;
+        const int flatten_elempack = total % 4 == 0 ? 4 : 1;
+        shape_flatten = Mat(total / flatten_elempack, (void*)0, elemsize * flatten_elempack, flatten_elempack);
+    }
+
+    { // Flatten helper layer, created with the caller's opt (not opt_int8)
+        flatten = ncnn::create_layer_vulkan(ncnn::LayerType::Flatten);
+        flatten->vkdev = vkdev;
+
+        flatten->bottom_shapes.resize(1);
+        flatten->bottom_shapes[0] = shape;
+        flatten->top_shapes.resize(1);
+        flatten->top_shapes[0] = shape_flatten;
+
+        ncnn::ParamDict pd;
+
+        flatten->load_param(pd);
+
+        flatten->create_pipeline(opt);
+    }
+
+    if (num_input_packed / 4 >= 32) // large num_input: two pipelines (sum8 then reduce); otherwise a single pipeline
+    {
+        const int outw_sum8 = (num_input_packed / 4 + 7) / 8;
+        const int outh_sum8 = num_output_packed / 4;
+
+        // sum8: first of the two pipelines; its output hint is outw_sum8 x outh_sum8
+        {
+            std::vector<vk_specialization_type> specializations(1 + 3);
+            specializations[0].i = num_input_packed / 4;
+            specializations[1 + 0].i = shape_flatten.w * shape_flatten.elempack;
+            specializations[1 + 1].i = outw_sum8;
+            specializations[1 + 2].i = outh_sum8;
+
+            pipeline_innerproduct_sum8_int8 = new Pipeline(vkdev);
+            pipeline_innerproduct_sum8_int8->set_local_size_xyz(8, std::min(8, outh_sum8), 1);
+            pipeline_innerproduct_sum8_int8->create(LayerShaderType::innerproduct_sum8_int8, opt_int8, specializations);
+        }
+
+        // reduce sum8: second pipeline; it is the one that receives the bias/activation specializations
+        {
+            std::vector<vk_specialization_type> specializations(4 + 3);
+            specializations[0].i = bias_term;
+            specializations[1].i = activation_type;
+            specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+            specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+            specializations[4 + 0].i = outw_sum8;
+            specializations[4 + 1].i = outh_sum8;
+            specializations[4 + 2].i = (num_output + 3) / 4;
+
+            pipeline_innerproduct_reduce_sum8_int8 = new Pipeline(vkdev);
+            pipeline_innerproduct_reduce_sum8_int8->set_local_size_xyz(std::min(64, (num_output + 3) / 4), 1, 1);
+            pipeline_innerproduct_reduce_sum8_int8->create(LayerShaderType::innerproduct_reduce_sum8_int8, opt_int8, specializations);
+        }
+    }
+    else
+    {
+        std::vector<vk_specialization_type> specializations(5 + 2);
+        specializations[0].i = bias_term;
+        specializations[1].i = activation_type;
+        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[4].i = num_input_packed / 4;
+        specializations[5 + 0].i = shape_flatten.w * shape_flatten.elempack;
+        specializations[5 + 1].i = num_output;
+
+        Mat local_size_xyz(std::min(64, (num_output + 3) / 4), 1, 1, (void*)0);
+        if (out_shape.dims != 0) // NOTE(review): assigns the same w/h/c values the constructor above already set
+        {
+            local_size_xyz.w = std::min(64, (num_output + 3) / 4);
+            local_size_xyz.h = 1;
+            local_size_xyz.c = 1;
+        }
+
+        pipeline_innerproduct_int8 = new Pipeline(vkdev);
+        pipeline_innerproduct_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_innerproduct_int8->create(LayerShaderType::innerproduct_int8, opt_int8, specializations);
+    }
+
+    // gemm for no shape hint: also build the gemm pipeline, with every shape specialization left at 0
+    if (shape.dims == 0)
+    {
+        std::vector<vk_specialization_type> specializations(6 + 10);
+        specializations[0].i = bias_term;
+        specializations[1].i = activation_type;
+        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[4].i = 0;
+        specializations[5].i = num_input_packed / 4;
+        specializations[6 + 0].i = 0;
+        specializations[6 + 1].i = 0;
+        specializations[6 + 2].i = 0;
+        specializations[6 + 3].i = 0;
+        specializations[6 + 4].i = 0;
+        specializations[6 + 5].i = 0;
+        specializations[6 + 6].i = 0;
+        specializations[6 + 7].i = 0;
+        specializations[6 + 8].i = 0;
+        specializations[6 + 9].i = 0;
+
+        Mat local_size_xyz(std::min(16, (num_output + 3) / 4), 4, 1, (void*)0);
+
+        pipeline_innerproduct_gemm_int8 = new Pipeline(vkdev);
+        if (opt.use_shader_local_memory)
+        {
+            pipeline_innerproduct_gemm_int8->set_local_size_xyz(8, 8, 1);
+        }
+        else
+        {
+            pipeline_innerproduct_gemm_int8->set_optimal_local_size_xyz(local_size_xyz);
+        }
+        pipeline_innerproduct_gemm_int8->create(LayerShaderType::innerproduct_gemm_int8, opt_int8, specializations);
+    }
+
+    if (opt.lightmode) // light mode: drop the unpacked weights now that the packed copy exists
+    {
+        weight_data.release();
+    }
+
+    return 0;
+}
+
+int InnerProduct_vulkan::upload_model_int8(VkTransfer& cmd, const Option& opt)
+{ // int8 variant of upload_model(): records uploads of packed weights, descales and optional bias, then releases the source Mats
+    cmd.record_upload(weight_data_int8_packed, weight_data_gpu, opt); // packed int8 weights go into weight_data_gpu
+
+    weight_data_int8_packed.release();
+
+    cmd.record_upload(weight_data_int8_descales, weight_data_int8_descales_gpu, opt);
+
+    weight_data_int8_descales.release();
+    weight_data_int8_scales.release(); // NOTE(review): released unconditionally, with no opt.lightmode check
+
+    if (bias_term)
+    {
+        cmd.record_upload(bias_data_int8_packed, bias_data_gpu, opt); // padded float bias goes into bias_data_gpu
+
+        bias_data_int8_packed.release();
+        bias_data.release(); // NOTE(review): the original bias is also released unconditionally
+    }
+
+    quantize->upload_model(cmd, opt); // return value is not checked
+
+    return 0;
+}
+
+int InnerProduct_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{ // int8 forward: quantizes the input when it is not 8-bit, then records the gemm, sum8+reduce, or single pipeline
+    const int num_input = weight_data_size / num_output;
+    const int out_elempack = num_output % 4 == 0 ? 4 : 1;
+
+    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
+    {
+        // gemm: 2D input with w == num_input; top_blob is num_output x h and keeps the input's elempack
+        VkMat bottom_blob_quantized = bottom_blob;
+
+        if (bottom_blob_quantized.elembits() != 8) // quantize unless the input is already 8-bit
+        {
+            Option opt_quantize = opt;
+            opt_quantize.blob_vkallocator = opt.workspace_vkallocator; // intermediate goes to the workspace allocator
+            opt_quantize.use_fp16_arithmetic = false;
+
+            VkMat bottom_blob_int8;
+            int ret = quantize->forward(bottom_blob_quantized, bottom_blob_int8, cmd, opt_quantize);
+            if (ret != 0)
+                return ret;
+
+            bottom_blob_quantized = bottom_blob_int8;
+        }
+
+        const int h = bottom_blob_quantized.h;
+        const int elempack = bottom_blob_quantized.elempack;
+        const int outh = h * elempack; // row count with elempack expanded
+        size_t out_elemsize; // elempack * (2 bytes with any fp16/bf16 option, else 4)
+        if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+        {
+            out_elemsize = elempack * 2u;
+        }
+        else
+        {
+            out_elemsize = elempack * 4u;
+        }
+
+        top_blob.create(num_output, h, out_elemsize, elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(7);
+        bindings[0] = bottom_blob_quantized;
+        bindings[1] = top_blob;
+        bindings[2] = weight_data_gpu;
+        bindings[3] = weight_data_int8_descales_gpu;
+        bindings[4] = bias_data_gpu;
+        bindings[5] = top_blob; // same blob as bindings[1]
+        bindings[6] = bottom_blob_quantized; // same blob as bindings[0]
+
+        std::vector<vk_constant_type> constants(10);
+        constants[0].i = bottom_blob_quantized.dims;
+        constants[1].i = num_input;
+        constants[2].i = outh;
+        constants[3].i = elempack;
+        constants[4].i = bottom_blob_quantized.w;
+        constants[5].i = top_blob.dims;
+        constants[6].i = num_output;
+        constants[7].i = outh;
+        constants[8].i = top_blob.c;
+        constants[9].i = top_blob.cstep;
+
+        VkMat dispatcher;
+        dispatcher.w = (num_output + 3) / 4; // ceil(num_output / 4)
+        dispatcher.h = (outh + 3) / 4; // ceil(outh / 4)
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_innerproduct_gemm_int8, bindings, constants, dispatcher);
+
+        return 0;
+    }
+
+    // flatten: every non-gemm input goes through the Flatten layer, output allocated from the workspace allocator
+    VkMat bottom_blob_flattened = bottom_blob;
+    {
+        Option opt_flatten = opt;
+        opt_flatten.blob_vkallocator = opt.workspace_vkallocator;
+
+        flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten); // return value is not checked
+    }
+
+    if (bottom_blob_flattened.elembits() != 8) // quantize unless the flattened input is already 8-bit
+    {
+        Option opt_quantize = opt;
+        opt_quantize.blob_vkallocator = opt.workspace_vkallocator;
+        opt_quantize.use_fp16_arithmetic = false;
+
+        VkMat bottom_blob_int8;
+        int ret = quantize->forward(bottom_blob_flattened, bottom_blob_int8, cmd, opt_quantize);
+        if (ret != 0)
+            return ret;
+
+        bottom_blob_flattened = bottom_blob_int8;
+    }
+
+    size_t out_elemsize; // out_elempack * (2 bytes with any fp16/bf16 option, else 4)
+    if (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+    {
+        out_elemsize = out_elempack * 2u;
+    }
+    else
+    {
+        out_elemsize = out_elempack * 4u;
+    }
+
+    const int num_input_packed = (num_input + 7) / 8 * 8; // same rounding as in create_pipeline_int8()
+    const int num_output_packed = (num_output + 3) / 4 * 4;
+
+    if (num_input_packed / 4 >= 32) // same threshold create_pipeline_int8() uses to build the sum8/reduce pipelines
+    {
+        // sum8: first pass writes top_blob_sum8 and is dispatched over that blob's shape
+        VkMat top_blob_sum8;
+        {
+            top_blob_sum8.create((num_input_packed / 4 + 7) / 8, num_output_packed / 4, (size_t)4u * 4, 4, opt.blob_vkallocator); // NOTE(review): intermediate uses blob_vkallocator, while flatten/quantize use workspace_vkallocator
+            if (top_blob_sum8.empty())
+                return -100;
+
+            std::vector<VkMat> bindings(3);
+            bindings[0] = bottom_blob_flattened;
+            bindings[1] = top_blob_sum8;
+            bindings[2] = weight_data_gpu;
+
+            std::vector<vk_constant_type> constants(3);
+            constants[0].i = bottom_blob_flattened.w * bottom_blob_flattened.elempack;
+            constants[1].i = top_blob_sum8.w;
+            constants[2].i = top_blob_sum8.h;
+
+            cmd.record_pipeline(pipeline_innerproduct_sum8_int8, bindings, constants, top_blob_sum8);
+        }
+
+        // reduce sum8: second pass binds top_blob_sum8 plus descales/bias and writes top_blob
+        {
+            top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+            if (top_blob.empty())
+                return -100;
+
+            std::vector<VkMat> bindings(4);
+            bindings[0] = top_blob_sum8;
+            bindings[1] = top_blob;
+            bindings[2] = weight_data_int8_descales_gpu;
+            bindings[3] = bias_data_gpu;
+
+            std::vector<vk_constant_type> constants(3);
+            constants[0].i = top_blob_sum8.w;
+            constants[1].i = top_blob_sum8.h;
+            constants[2].i = (num_output + 3) / 4;
+
+            cmd.record_pipeline(pipeline_innerproduct_reduce_sum8_int8, bindings, constants, top_blob);
+        }
+
+        return 0;
+    }
+
+    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); // below the threshold: single pipeline
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<VkMat> bindings(5);
+    bindings[0] = bottom_blob_flattened;
+    bindings[1] = top_blob;
+    bindings[2] = weight_data_gpu;
+    bindings[3] = weight_data_int8_descales_gpu;
+    bindings[4] = bias_data_gpu;
+
+    std::vector<vk_constant_type> constants(2);
+    constants[0].i = bottom_blob_flattened.w * bottom_blob_flattened.elempack;
+    constants[1].i = num_output;
+
+    VkMat dispatcher;
+    dispatcher.w = (num_output + 3) / 4; // ceil(num_output / 4)
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline_innerproduct_int8, bindings, constants, dispatcher);
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h
index 8495b2296167..63f6fe4beeb1 100644
--- a/src/layer/vulkan/innerproduct_vulkan.h
+++ b/src/layer/vulkan/innerproduct_vulkan.h
@@ -21,6 +21,13 @@ class InnerProduct_vulkan : public InnerProduct
     using InnerProduct::forward;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8(const Option& opt);
+    int upload_model_int8(VkTransfer& cmd, const Option& opt);
+    int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+#endif
+
 public:
     ncnn::Layer* flatten;
 
@@ -35,6 +42,21 @@ class InnerProduct_vulkan : public InnerProduct
     Pipeline* pipeline_innerproduct_reduce_sum8;
 
     Pipeline* pipeline_innerproduct_gemm;
+
+#if NCNN_INT8
+    ncnn::Layer* quantize;
+
+    Mat weight_data_int8_packed;
+    Mat weight_data_int8_descales;
+    Mat bias_data_int8_packed;
+
+    VkMat weight_data_int8_descales_gpu;
+
+    Pipeline* pipeline_innerproduct_int8;
+    Pipeline* pipeline_innerproduct_sum8_int8;
+    Pipeline* pipeline_innerproduct_reduce_sum8_int8;
+    Pipeline* pipeline_innerproduct_gemm_int8;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/padding_vulkan.cpp b/src/layer/vulkan/padding_vulkan.cpp
index 7e2120ca59f4..ea1242d04b93 100644
--- a/src/layer/vulkan/padding_vulkan.cpp
+++ b/src/layer/vulkan/padding_vulkan.cpp
@@ -19,6 +19,16 @@ Padding_vulkan::Padding_vulkan()
 
     pipeline_padding_3d = 0;
     pipeline_padding_3d_pack4 = 0;
+
+#if NCNN_INT8
+    pipeline_padding_int8 = 0;
+    pipeline_padding_pack4_int8 = 0;
+    pipeline_padding_pack1to4_int8 = 0;
+    pipeline_padding_pack4to1_int8 = 0;
+
+    pipeline_padding_3d_int8 = 0;
+    pipeline_padding_3d_pack4_int8 = 0;
+#endif // NCNN_INT8
 }
 
 int Padding_vulkan::create_pipeline(const Option& opt)
@@ -26,6 +36,25 @@ int Padding_vulkan::create_pipeline(const Option& opt)
     const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
     const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
+#if NCNN_INT8
+    if (per_channel_pad_data_size)
+    {
+        per_channel_pad_data_int8.create((per_channel_pad_data_size + 3) / 4 * 4, (size_t)1u, 1);
+        if (per_channel_pad_data_int8.empty())
+            return -100;
+
+        signed char* outptr = per_channel_pad_data_int8;
+        for (int i = 0; i < per_channel_pad_data_int8.w; i++)
+        {
+            outptr[i] = 0;
+        }
+        for (int i = 0; i < per_channel_pad_data_size; i++)
+        {
+            outptr[i] = static_cast<signed char>((float)per_channel_pad_data[i]);
+        }
+    }
+#endif // NCNN_INT8
+
     int offset_elempack = 1;
     if (shape.dims == 1)
     {
@@ -125,24 +154,30 @@ int Padding_vulkan::create_pipeline(const Option& opt)
     }
 
     // pack1
-    if (out_shape.dims == 0 || (offset_elempack == 1 && out_shape.elempack == 1))
+    if (out_shape.dims == 0 || (out_shape.dims != 4 && offset_elempack == 1 && out_shape.elempack == 1))
     {
         pipeline_padding = new Pipeline(vkdev);
         pipeline_padding->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_padding->create(LayerShaderType::padding, opt, specializations);
+    }
 
+    if (out_shape.dims == 0 || (out_shape.dims == 4 && offset_elempack == 1 && out_shape.elempack == 1))
+    {
         pipeline_padding_3d = new Pipeline(vkdev);
         pipeline_padding_3d->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_padding_3d->create(LayerShaderType::padding_3d, opt, specializations_3d);
     }
 
     // pack4
-    if (out_shape.dims == 0 || (offset_elempack == 4 && out_shape.elempack == 4))
+    if (out_shape.dims == 0 || (out_shape.dims != 4 && offset_elempack == 4 && out_shape.elempack == 4))
     {
         pipeline_padding_pack4 = new Pipeline(vkdev);
         pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_padding_pack4->create(LayerShaderType::padding_pack4, opt, specializations);
+    }
 
+    if (out_shape.dims == 0 || (out_shape.dims == 4 && offset_elempack == 4 && out_shape.elempack == 4))
+    {
         pipeline_padding_3d_pack4 = new Pipeline(vkdev);
         pipeline_padding_3d_pack4->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_padding_3d_pack4->create(LayerShaderType::padding_3d_pack4, opt, specializations_3d);
@@ -164,6 +199,13 @@ int Padding_vulkan::create_pipeline(const Option& opt)
         pipeline_padding_pack4to1->create(LayerShaderType::padding_pack4to1, opt, specializations);
     }
 
+#if NCNN_INT8
+    if (opt.use_int8_packed || opt.use_int8_storage)
+    {
+        return create_pipeline_int8(opt);
+    }
+#endif // NCNN_INT8
+
     return 0;
 }
 
@@ -187,6 +229,26 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_padding_3d_pack4;
     pipeline_padding_3d_pack4 = 0;
 
+#if NCNN_INT8
+    delete pipeline_padding_int8;
+    pipeline_padding_int8 = 0;
+
+    delete pipeline_padding_pack4_int8;
+    pipeline_padding_pack4_int8 = 0;
+
+    delete pipeline_padding_pack1to4_int8;
+    pipeline_padding_pack1to4_int8 = 0;
+
+    delete pipeline_padding_pack4to1_int8;
+    pipeline_padding_pack4to1_int8 = 0;
+
+    delete pipeline_padding_3d_int8;
+    pipeline_padding_3d_int8 = 0;
+
+    delete pipeline_padding_3d_pack4_int8;
+    pipeline_padding_3d_pack4_int8 = 0;
+#endif // NCNN_INT8
+
     return 0;
 }
 
@@ -197,8 +259,15 @@ int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 
     cmd.record_upload(per_channel_pad_data, per_channel_pad_data_gpu, opt);
 
+#if NCNN_INT8
+    cmd.record_upload(per_channel_pad_data_int8, per_channel_pad_data_int8_gpu, opt);
+#endif // NCNN_INT8
+
     if (opt.lightmode)
     {
+#if NCNN_INT8
+        per_channel_pad_data_int8.release();
+#endif // NCNN_INT8
         per_channel_pad_data.release();
     }
 
@@ -207,6 +276,11 @@ int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 
 int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
+#if NCNN_INT8
+    if (bottom_blob.elembits() == 8)
+        return forward_int8(bottom_blob, top_blob, cmd, opt);
+#endif // NCNN_INT8
+
     int dims = bottom_blob.dims;
     int w = bottom_blob.w;
     int h = bottom_blob.h;
@@ -383,6 +457,12 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
 int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
 {
     const VkMat& bottom_blob = bottom_blobs[0];
+
+#if NCNN_INT8
+    if (bottom_blob.elembits() == 8)
+        return forward_int8(bottom_blobs, top_blobs, cmd, opt);
+#endif // NCNN_INT8
+
     const VkMat& reference_blob = bottom_blobs[1];
 
     VkMat& top_blob = top_blobs[0];
@@ -576,4 +656,541 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
     return 0;
 }
 
+#if NCNN_INT8
+int Padding_vulkan::create_pipeline_int8(const Option& opt)
+{ // builds the int8 padding pipelines selected by the shape hints; all six variants are created when there is no output hint
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    Mat shape_int8; // input hint rebuilt with elemsize == elempack (1 byte per scalar); stays empty when dims == 0
+    if (shape.dims == 1) shape_int8 = Mat(shape.w, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 2) shape_int8 = Mat(shape.w, shape.h, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 3) shape_int8 = Mat(shape.w, shape.h, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack);
+    if (shape.dims == 4) shape_int8 = Mat(shape.w, shape.h, shape.d, shape.c, (void*)0, (size_t)shape.elempack, shape.elempack);
+
+    Mat out_shape_int8; // output hint rebuilt the same way
+    if (out_shape.dims == 1) out_shape_int8 = Mat(out_shape.w, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 2) out_shape_int8 = Mat(out_shape.w, out_shape.h, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 3) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+    if (out_shape.dims == 4) out_shape_int8 = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c, (void*)0, (size_t)out_shape.elempack, out_shape.elempack);
+
+    int offset_elempack = 1; // derived from the leading pad (left/top/front by dims): input elempack if the pad is 0, 4 if it is a multiple of 4, else 1
+    if (shape_int8.dims == 1)
+    {
+        if (left == 0)
+            offset_elempack = shape_int8.elempack;
+        else
+            offset_elempack = left % 4 == 0 ? 4 : 1;
+    }
+    else if (shape_int8.dims == 2)
+    {
+        if (top == 0)
+            offset_elempack = shape_int8.elempack;
+        else
+            offset_elempack = top % 4 == 0 ? 4 : 1;
+    }
+    else if (shape_int8.dims == 3)
+    {
+        if (front == 0)
+            offset_elempack = shape_int8.elempack;
+        else
+            offset_elempack = front % 4 == 0 ? 4 : 1;
+    }
+    else // shape_int8.dims == 4; also reached when there is no shape hint (dims == 0)
+    {
+        offset_elempack = shape_int8.elempack;
+    }
+
+    offset_elempack = std::min(offset_elempack, shape_int8.elempack); // never exceed the input hint's own elempack
+
+    Mat shape_unpacked = shape_int8; // input hint re-expressed at offset_elempack when that is smaller than the input's elempack
+    if (one_blob_only && shape_int8.dims != 0 && shape_int8.elempack > offset_elempack)
+    {
+        size_t offset_elemsize = shape_int8.elemsize / shape_int8.elempack * offset_elempack;
+
+        if (shape_int8.dims == 1) shape_unpacked = Mat(shape_int8.w * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack);
+        if (shape_int8.dims == 2) shape_unpacked = Mat(shape_int8.w, shape_int8.h * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack);
+        if (shape_int8.dims == 3) shape_unpacked = Mat(shape_int8.w, shape_int8.h, shape_int8.c * shape_int8.elempack / offset_elempack, (void*)0, offset_elemsize, offset_elempack);
+        // if (shape_int8.dims == 4) should never reach here
+    }
+
+    std::vector<vk_specialization_type> specializations(3 + 10); // 3 layer params, then 5 input and 5 output shape values
+    specializations[0].i = type;
+    specializations[1].f = value;
+    specializations[2].i = per_channel_pad_data_size ? 1 : 0;
+    specializations[3 + 0].i = shape_unpacked.dims;
+    specializations[3 + 1].i = shape_unpacked.w;
+    specializations[3 + 2].i = shape_unpacked.h;
+    specializations[3 + 3].i = shape_unpacked.c;
+    specializations[3 + 4].i = shape_unpacked.cstep;
+    specializations[3 + 5].i = out_shape_int8.dims;
+    specializations[3 + 6].i = out_shape_int8.w;
+    specializations[3 + 7].i = out_shape_int8.h;
+    specializations[3 + 8].i = out_shape_int8.c;
+    specializations[3 + 9].i = out_shape_int8.cstep;
+
+    std::vector<vk_specialization_type> specializations_3d(3 + 12); // same layout with d added on both sides, used by the *_3d pipelines
+    specializations_3d[0].i = type;
+    specializations_3d[1].f = value;
+    specializations_3d[2].i = per_channel_pad_data_size ? 1 : 0;
+    specializations_3d[3 + 0].i = shape_unpacked.dims;
+    specializations_3d[3 + 1].i = shape_unpacked.w;
+    specializations_3d[3 + 2].i = shape_unpacked.h;
+    specializations_3d[3 + 3].i = shape_unpacked.d;
+    specializations_3d[3 + 4].i = shape_unpacked.c;
+    specializations_3d[3 + 5].i = shape_unpacked.cstep;
+    specializations_3d[3 + 6].i = out_shape_int8.dims;
+    specializations_3d[3 + 7].i = out_shape_int8.w;
+    specializations_3d[3 + 8].i = out_shape_int8.h;
+    specializations_3d[3 + 9].i = out_shape_int8.d;
+    specializations_3d[3 + 10].i = out_shape_int8.c;
+    specializations_3d[3 + 11].i = out_shape_int8.cstep;
+
+    Mat local_size_xyz; // local size hint taken from the output shape; left default-constructed when there is no output hint
+    if (out_shape_int8.dims == 1)
+    {
+        local_size_xyz.w = std::min(64, out_shape_int8.w);
+        local_size_xyz.h = 1;
+        local_size_xyz.c = 1;
+    }
+    if (out_shape_int8.dims == 2)
+    {
+        local_size_xyz.w = std::min(8, out_shape_int8.w);
+        local_size_xyz.h = std::min(8, out_shape_int8.h);
+        local_size_xyz.c = 1;
+    }
+    if (out_shape_int8.dims == 3)
+    {
+        local_size_xyz.w = std::min(4, out_shape_int8.w);
+        local_size_xyz.h = std::min(4, out_shape_int8.h);
+        local_size_xyz.c = std::min(4, out_shape_int8.c);
+    }
+    if (out_shape_int8.dims == 4)
+    {
+        local_size_xyz.w = std::min(4, out_shape_int8.w);
+        local_size_xyz.h = std::min(4, out_shape_int8.h * out_shape_int8.d); // h and d are combined for the 4D case
+        local_size_xyz.c = std::min(4, out_shape_int8.c);
+    }
+
+    // pack1, output hint is not 4D (or there is no output hint)
+    if (out_shape_int8.dims == 0 || (out_shape_int8.dims != 4 && offset_elempack == 1 && out_shape_int8.elempack == 1))
+    {
+        pipeline_padding_int8 = new Pipeline(vkdev);
+        pipeline_padding_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_int8->create(LayerShaderType::padding_int8, opt, specializations);
+    }
+
+    if (out_shape_int8.dims == 0 || (out_shape_int8.dims == 4 && offset_elempack == 1 && out_shape_int8.elempack == 1)) // pack1, 4D output hint
+    {
+        pipeline_padding_3d_int8 = new Pipeline(vkdev);
+        pipeline_padding_3d_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_3d_int8->create(LayerShaderType::padding_3d_int8, opt, specializations_3d);
+    }
+
+    // pack4, output hint is not 4D (or there is no output hint)
+    if (out_shape_int8.dims == 0 || (out_shape_int8.dims != 4 && offset_elempack == 4 && out_shape_int8.elempack == 4))
+    {
+        pipeline_padding_pack4_int8 = new Pipeline(vkdev);
+        pipeline_padding_pack4_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_pack4_int8->create(LayerShaderType::padding_pack4_int8, opt, specializations);
+    }
+
+    if (out_shape_int8.dims == 0 || (out_shape_int8.dims == 4 && offset_elempack == 4 && out_shape_int8.elempack == 4)) // pack4, 4D output hint
+    {
+        pipeline_padding_3d_pack4_int8 = new Pipeline(vkdev);
+        pipeline_padding_3d_pack4_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_3d_pack4_int8->create(LayerShaderType::padding_3d_pack4_int8, opt, specializations_3d);
+    }
+
+    // pack1to4: offset elempack 1, output elempack 4 (no dims restriction)
+    if (out_shape_int8.dims == 0 || (offset_elempack == 1 && out_shape_int8.elempack == 4))
+    {
+        pipeline_padding_pack1to4_int8 = new Pipeline(vkdev);
+        pipeline_padding_pack1to4_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_pack1to4_int8->create(LayerShaderType::padding_pack1to4_int8, opt, specializations);
+    }
+
+    // pack4to1: offset elempack 4, output elempack 1 (no dims restriction)
+    if (out_shape_int8.dims == 0 || (offset_elempack == 4 && out_shape_int8.elempack == 1))
+    {
+        pipeline_padding_pack4to1_int8 = new Pipeline(vkdev);
+        pipeline_padding_pack4to1_int8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_padding_pack4to1_int8->create(LayerShaderType::padding_pack4to1_int8, opt, specializations);
+    }
+
+    return 0;
+}
+
+int Padding_vulkan::forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{ // Records one int8 padding dispatch for bottom_blob into top_blob; pad amounts are left/right/top/bottom/front/behind, which are not declared in this function. Returns 0, or -100 if top_blob is empty after create().
+    int dims = bottom_blob.dims;
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int d = bottom_blob.d;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = 0;
+    int outh = 0;
+    int outd = 0;
+    int outc = 0;
+
+    int offset_elempack; // packing the input is converted down to (if needed) before it is bound
+    int out_elempack; // packing top_blob is created with
+
+    if (dims == 1) // 1-D: only left/right are used and w is the packed axis
+    {
+        if (left == 0 && right == 0)
+        {
+            top_blob = bottom_blob; // nothing to pad: the output is simply assigned from the input
+            return 0;
+        }
+
+        outw = w * elempack + left + right; // padded length counted in unpacked elements
+        out_elempack = outw % 4 == 0 ? 4 : 1; // 4 only when the padded length is a multiple of 4
+        offset_elempack = left == 0 ? elempack : left % 4 == 0 ? 4 : 1; // no leading pad keeps the input packing; otherwise 4 requires left to be a multiple of 4
+    }
+    else if (dims == 2) // 2-D: h is the packed axis, so top/bottom decide the packing
+    {
+        if (top == 0 && bottom == 0 && left == 0 && right == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + left + right;
+        outh = h * elempack + top + bottom; // padded height counted in unpacked rows
+        out_elempack = outh % 4 == 0 ? 4 : 1;
+        offset_elempack = top == 0 ? elempack : top % 4 == 0 ? 4 : 1;
+    }
+    else if (dims == 3) // 3-D: c is the packed axis, so front/behind decide the packing
+    {
+        if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + left + right;
+        outh = h + top + bottom;
+        outc = channels * elempack + front + behind; // padded channel count in unpacked channels
+        out_elempack = outc % 4 == 0 ? 4 : 1;
+        offset_elempack = front == 0 ? elempack : front % 4 == 0 ? 4 : 1;
+    }
+    else // if (dims == 4)
+    {
+        if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + left + right;
+        outh = h + top + bottom;
+        outd = d + front + behind; // on this path front/behind extend d, not the channel count
+        outc = channels * elempack; // channel count is unchanged
+        out_elempack = elempack; // so the input packing is kept as is
+        offset_elempack = elempack;
+    }
+
+    offset_elempack = std::min(offset_elempack, elempack); // never request a wider packing than the input already has
+
+    size_t out_elemsize = elemsize / elempack * out_elempack; // per-scalar size of the input scaled to the output packing
+
+    // unpacking: convert the input down to offset_elempack when it is packed wider than that
+    VkMat bottom_blob_unpacked = bottom_blob;
+    if (elempack > offset_elempack)
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator; // presumably so the converted temporary is allocated from the workspace allocator — confirm against convert_packing
+
+        vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1);
+    }
+
+    if (dims == 1)
+    {
+        top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else if (dims == 2)
+    {
+        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else if (dims == 3)
+    {
+        top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else // if (dims == 4)
+    {
+        top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    if (top_blob.empty()) // create() did not produce a usable blob
+        return -100;
+
+    std::vector<VkMat> bindings(3); // [0] source, [1] destination, [2] per_channel_pad_data_int8_gpu
+    bindings[0] = bottom_blob_unpacked;
+    bindings[1] = top_blob;
+    bindings[2] = per_channel_pad_data_int8_gpu;
+
+    if (dims == 4) // 4-D path: its own pipelines and a 15-entry constant layout that also carries d
+    {
+        std::vector<vk_constant_type> constants(15);
+        constants[0].i = bottom_blob_unpacked.dims;
+        constants[1].i = bottom_blob_unpacked.w;
+        constants[2].i = bottom_blob_unpacked.h;
+        constants[3].i = bottom_blob_unpacked.d;
+        constants[4].i = bottom_blob_unpacked.c;
+        constants[5].i = bottom_blob_unpacked.cstep;
+        constants[6].i = top_blob.dims;
+        constants[7].i = top_blob.w;
+        constants[8].i = top_blob.h;
+        constants[9].i = top_blob.d;
+        constants[10].i = top_blob.c;
+        constants[11].i = top_blob.cstep;
+        constants[12].i = left;
+        constants[13].i = top;
+        constants[14].i = front;
+
+        const Pipeline* pipeline = out_elempack == 4 ? pipeline_padding_3d_pack4_int8 : pipeline_padding_3d_int8; // offset_elempack == out_elempack == elempack on this path, so one value selects
+
+        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+        return 0;
+    }
+
+    std::vector<vk_constant_type> constants(13); // 1-D/2-D/3-D layout: the same fields as the 4-D layout without the two d entries
+    constants[0].i = bottom_blob_unpacked.dims;
+    constants[1].i = bottom_blob_unpacked.w;
+    constants[2].i = bottom_blob_unpacked.h;
+    constants[3].i = bottom_blob_unpacked.c;
+    constants[4].i = bottom_blob_unpacked.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = top_blob.cstep;
+    constants[10].i = left;
+    constants[11].i = top;
+    constants[12].i = front;
+
+    const Pipeline* pipeline = 0; // NOTE(review): stays null when offset_elempack is neither 1 nor 4 — confirm int8 blobs never arrive with another elempack
+    if (offset_elempack == 1 && out_elempack == 1)
+    {
+        pipeline = pipeline_padding_int8;
+    }
+    else if (offset_elempack == 4 && out_elempack == 4)
+    {
+        pipeline = pipeline_padding_pack4_int8;
+    }
+    else if (offset_elempack == 1 && out_elempack == 4)
+    {
+        pipeline = pipeline_padding_pack1to4_int8;
+    }
+    else if (offset_elempack == 4 && out_elempack == 1)
+    {
+        pipeline = pipeline_padding_pack4to1_int8;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+    return 0;
+}
+
+int Padding_vulkan::forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{ // Records one int8 padding dispatch for bottom_blobs[0] into top_blobs[0]; the six pad amounts are read from bottom_blobs[1]. Returns 0, or -100 if top_blob is empty after create().
+    const VkMat& bottom_blob = bottom_blobs[0];
+    const VkMat& reference_blob = bottom_blobs[1]; // NOTE(review): indexed without checking bottom_blobs.size() — confirm callers always pass two blobs
+
+    VkMat& top_blob = top_blobs[0];
+    int _top;
+    int _bottom;
+    int _left;
+    int _right;
+    int _front;
+    int _behind;
+    {
+        const int* param_data = reference_blob.mapped(); // assumes mapped() yields host-readable memory holding at least 6 ints — TODO confirm
+
+        _top = param_data[0]; // element order: top, bottom, left, right, front, behind
+        _bottom = param_data[1];
+        _left = param_data[2];
+        _right = param_data[3];
+        _front = param_data[4];
+        _behind = param_data[5];
+    }
+
+    int dims = bottom_blob.dims;
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int d = bottom_blob.d;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = 0;
+    int outh = 0;
+    int outd = 0;
+    int outc = 0;
+
+    int offset_elempack; // packing the input is converted down to (if needed) before it is bound
+    int out_elempack; // packing top_blob is created with
+
+    if (dims == 1) // 1-D: only _left/_right are used and w is the packed axis
+    {
+        if (_left == 0 && _right == 0)
+        {
+            top_blob = bottom_blob; // nothing to pad: the output is simply assigned from the input
+            return 0;
+        }
+
+        outw = w * elempack + _left + _right; // padded length counted in unpacked elements
+        out_elempack = outw % 4 == 0 ? 4 : 1; // 4 only when the padded length is a multiple of 4
+        offset_elempack = _left == 0 ? elempack : _left % 4 == 0 ? 4 : 1; // no leading pad keeps the input packing; otherwise 4 requires _left to be a multiple of 4
+    }
+    else if (dims == 2) // 2-D: h is the packed axis, so _top/_bottom decide the packing
+    {
+        if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + _left + _right;
+        outh = h * elempack + _top + _bottom; // padded height counted in unpacked rows
+        out_elempack = outh % 4 == 0 ? 4 : 1;
+        offset_elempack = _top == 0 ? elempack : _top % 4 == 0 ? 4 : 1;
+    }
+    else if (dims == 3) // 3-D: c is the packed axis, so _front/_behind decide the packing
+    {
+        if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0 && _front == 0 && _behind == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + _left + _right;
+        outh = h + _top + _bottom;
+        outc = channels * elempack + _front + _behind; // padded channel count in unpacked channels
+        out_elempack = outc % 4 == 0 ? 4 : 1;
+        offset_elempack = _front == 0 ? elempack : _front % 4 == 0 ? 4 : 1;
+    }
+    else // if (dims == 4)
+    {
+        if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0 && _front == 0 && _behind == 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        outw = w + _left + _right;
+        outh = h + _top + _bottom;
+        outd = d + _front + _behind; // on this path _front/_behind extend d, not the channel count
+        outc = channels * elempack; // channel count is unchanged
+        out_elempack = elempack; // so the input packing is kept as is
+        offset_elempack = elempack;
+    }
+
+    offset_elempack = std::min(offset_elempack, elempack); // never request a wider packing than the input already has
+
+    size_t out_elemsize = elemsize / elempack * out_elempack; // per-scalar size of the input scaled to the output packing
+
+    // unpacking: convert the input down to offset_elempack when it is packed wider than that
+    VkMat bottom_blob_unpacked = bottom_blob;
+    if (elempack > offset_elempack)
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator; // presumably so the converted temporary is allocated from the workspace allocator — confirm against convert_packing
+
+        vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1);
+    }
+
+    if (dims == 1)
+    {
+        top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else if (dims == 2)
+    {
+        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else if (dims == 3)
+    {
+        top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    else // if (dims == 4)
+    {
+        top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    if (top_blob.empty()) // create() did not produce a usable blob
+        return -100;
+
+    std::vector<VkMat> bindings(3); // [0] source, [1] destination, [2] per_channel_pad_data_int8_gpu
+    bindings[0] = bottom_blob_unpacked;
+    bindings[1] = top_blob;
+    bindings[2] = per_channel_pad_data_int8_gpu;
+
+    if (dims == 4) // 4-D path: its own pipelines and a 15-entry constant layout that also carries d
+    {
+        std::vector<vk_constant_type> constants(15);
+        constants[0].i = bottom_blob_unpacked.dims;
+        constants[1].i = bottom_blob_unpacked.w;
+        constants[2].i = bottom_blob_unpacked.h;
+        constants[3].i = bottom_blob_unpacked.d;
+        constants[4].i = bottom_blob_unpacked.c;
+        constants[5].i = bottom_blob_unpacked.cstep;
+        constants[6].i = top_blob.dims;
+        constants[7].i = top_blob.w;
+        constants[8].i = top_blob.h;
+        constants[9].i = top_blob.d;
+        constants[10].i = top_blob.c;
+        constants[11].i = top_blob.cstep;
+        constants[12].i = _left;
+        constants[13].i = _top;
+        constants[14].i = _front;
+
+        const Pipeline* pipeline = out_elempack == 4 ? pipeline_padding_3d_pack4_int8 : pipeline_padding_3d_int8; // offset_elempack == out_elempack == elempack on this path, so one value selects
+
+        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+        return 0;
+    }
+
+    std::vector<vk_constant_type> constants(13); // 1-D/2-D/3-D layout: the same fields as the 4-D layout without the two d entries
+    constants[0].i = bottom_blob_unpacked.dims;
+    constants[1].i = bottom_blob_unpacked.w;
+    constants[2].i = bottom_blob_unpacked.h;
+    constants[3].i = bottom_blob_unpacked.c;
+    constants[4].i = bottom_blob_unpacked.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = top_blob.cstep;
+    constants[10].i = _left;
+    constants[11].i = _top;
+    constants[12].i = _front;
+
+    const Pipeline* pipeline = 0; // NOTE(review): stays null when offset_elempack is neither 1 nor 4 — confirm int8 blobs never arrive with another elempack
+    if (offset_elempack == 1 && out_elempack == 1)
+    {
+        pipeline = pipeline_padding_int8;
+    }
+    else if (offset_elempack == 4 && out_elempack == 4)
+    {
+        pipeline = pipeline_padding_pack4_int8;
+    }
+    else if (offset_elempack == 1 && out_elempack == 4)
+    {
+        pipeline = pipeline_padding_pack1to4_int8;
+    }
+    else if (offset_elempack == 4 && out_elempack == 1)
+    {
+        pipeline = pipeline_padding_pack4to1_int8;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h
index c2abf6ea96fa..4d73bc037bc1 100644
--- a/src/layer/vulkan/padding_vulkan.h
+++ b/src/layer/vulkan/padding_vulkan.h
@@ -23,9 +23,21 @@ class Padding_vulkan : public Padding
 
     virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8(const Option& opt); // NOTE(review): presumably creates the pipeline_padding_*_int8 objects — confirm against the definition
+    int forward_int8(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; // int8 padding of one blob using the layer's own pad amounts
+    int forward_int8(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; // int8 padding of bottom_blobs[0] with pad amounts read from bottom_blobs[1]
+#endif // NCNN_INT8
+
 public:
     VkMat per_channel_pad_data_gpu;
 
+#if NCNN_INT8
+    Mat per_channel_pad_data_int8; // NOTE(review): presumably the host-side int8 per-channel pad values; where it is filled is not visible from this header
+    VkMat per_channel_pad_data_int8_gpu; // bound as the third buffer (bindings[2]) of each forward_int8 dispatch
+#endif // NCNN_INT8
+
     Pipeline* pipeline_padding;
     Pipeline* pipeline_padding_pack4;
     Pipeline* pipeline_padding_pack1to4;
@@ -33,6 +45,16 @@ class Padding_vulkan : public Padding
 
     Pipeline* pipeline_padding_3d;
     Pipeline* pipeline_padding_3d_pack4;
+
+#if NCNN_INT8
+    Pipeline* pipeline_padding_int8; // 1-D/2-D/3-D blobs: input bound as pack1, output pack1
+    Pipeline* pipeline_padding_pack4_int8; // 1-D/2-D/3-D blobs: input bound as pack4, output pack4
+    Pipeline* pipeline_padding_pack1to4_int8; // 1-D/2-D/3-D blobs: input bound as pack1, output pack4
+    Pipeline* pipeline_padding_pack4to1_int8; // 1-D/2-D/3-D blobs: input bound as pack4, output pack1
+
+    Pipeline* pipeline_padding_3d_int8; // 4-D blobs whose output packing is not 4
+    Pipeline* pipeline_padding_3d_pack4_int8; // 4-D blobs whose output packing is 4
+#endif // NCNN_INT8
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp b/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp
new file mode 100644
index 000000000000..b7a838f46a52
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_1x1s1d1_int8_cm.comp
@@ -0,0 +1,739 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_basic : require
+
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#if ncnn_VK_KHR_cooperative_matrix
+#extension GL_KHR_cooperative_matrix : require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix : require
+#extension GL_NV_integer_cooperative_matrix : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0;
+layout(constant_id = 1) const int activation_type = 0;
+layout(constant_id = 2) const float activation_param_0 = 0;
+layout(constant_id = 3) const float activation_param_1 = 0;
+layout(constant_id = 4) const int use_int8_requantize = 0;
+layout(constant_id = 5) const uint elempack = 1; // input packing; main() has one load path for 1 and treats anything else as 4
+layout(constant_id = 6) const uint out_elempack = 1;
+
+#define shape_constant_id_offset 7
+layout(constant_id = shape_constant_id_offset + 0) const uint cstep = 0; // shape values are read through psc(), defined outside this file — presumably it chooses between these and block p, TODO confirm
+layout(constant_id = shape_constant_id_offset + 1) const uint outcstep = 0;
+layout(constant_id = shape_constant_id_offset + 2) const uint size = 0; // bounds the M-side index and sets the workgroup count along M in main()
+layout(constant_id = shape_constant_id_offset + 3) const uint num_output = 0; // sets the workgroup count along N in main()
+layout(constant_id = shape_constant_id_offset + 4) const uint num_input = 0; // bounds the K-side index and sets the number of K steps in main()
+
+layout(constant_id = shape_constant_id_offset + 5 + 0) const uint M = 1; // rows of the A tile (M x K) and of the accumulator tile (M x N)
+layout(constant_id = shape_constant_id_offset + 5 + 1) const uint N = 1; // columns of the B tile (K x N) and of the accumulator tile
+layout(constant_id = shape_constant_id_offset + 5 + 2) const uint K = 1; // shared dimension of the A and B tiles
+layout(constant_id = shape_constant_id_offset + 5 + 3) const uint subgroup_size = 32; // used to spread tile loads over the invocations of a subgroup
+layout(constant_id = shape_constant_id_offset + 5 + 4) const uint UNROLL_SG_M = 2; // accumulator tiles per subgroup along M
+layout(constant_id = shape_constant_id_offset + 5 + 5) const uint UNROLL_SG_N = 2; // accumulator tiles per subgroup along N
+layout(constant_id = shape_constant_id_offset + 5 + 6) const uint UNROLL_SG_K = 2; // K steps consumed per iteration of the main loop
+layout(constant_id = shape_constant_id_offset + 5 + 7) const uint UNROLL_WG_M = 2; // first dimension of tmp_v, which main() indexes by sgmi = gl_SubgroupID / UNROLL_WG_N
+layout(constant_id = shape_constant_id_offset + 5 + 8) const uint UNROLL_WG_N = 2; // first dimension of tmp_k, which main() indexes by sgni = gl_SubgroupID % UNROLL_WG_N
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input, fetched with i8buffer_sm4 in main()
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+layout(binding = 2) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; };
+layout(binding = 3) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // int8 weights, fetched with i8buffer_sm4 in main()
+layout(binding = 4) readonly buffer bias_blob { vec4 bias_data[]; };
+layout(binding = 5) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; };
+layout(binding = 6) readonly buffer top_scales_blob { sfp top_scales_data[]; };
+
+layout(push_constant) uniform parameter // same five names as the shape specialization constants above
+{
+    uint cstep;
+    uint outcstep;
+    uint size;
+    uint num_output;
+    uint num_input;
+} p;
+
+const uint Nd4 = N / 4; // tile extents divided by 4: each int in the staging arrays is filled by one i8buffer_sm4 fetch (presumably four packed int8 values — confirm)
+const uint Kd4 = K / 4;
+const uint Md4 = M / 4;
+
+#if ncnn_VK_KHR_cooperative_matrix
+#define PAD 1
+#elif ncnn_VK_NV_cooperative_matrix
+#define PAD 0
+#endif
+
+const uint Nd4p = Nd4 + PAD; // stride passed when B tiles are loaded from tmp_k; NOTE(review): PAD is undefined if neither cooperative-matrix macro is set
+const uint Kd4p = Kd4 + PAD; // stride passed when A tiles are loaded from tmp_v on the elempack == 4 path
+
+shared int tmp_v[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; // staged input (A) tiles, one row per sgmi
+shared int tmp_k[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; // staged weight (B) tiles, one row per sgni
+shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; // not referenced in the part of main() reviewed here
+
+void main()
+{
+    const uint wgi = gl_WorkGroupID.x;
+    const uint sgi = gl_SubgroupID;
+
+    const uint wgmm = (psc(size) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M);
+    const uint wgnn = (psc(num_output) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N);
+
+    const uint wgmi = wgi / wgnn;
+    const uint wgni = wgi % wgnn;
+
+    const uint sgmi = sgi / UNROLL_WG_N;
+    const uint sgni = sgi % UNROLL_WG_N;
+
+    const uint kk = (psc(num_input) + K - 1) / K;
+    const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+
+    if (wgmi >= wgmm)
+        return;
+
+    const uint si = gl_SubgroupInvocationID;
+
+    const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M;
+    const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N;
+
+#if ncnn_VK_KHR_cooperative_matrix
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum[UNROLL_SG_N][UNROLL_SG_M];
+#elif ncnn_VK_NV_cooperative_matrix
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M];
+#endif
+
+    {
+        [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                sum[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+#elif ncnn_VK_NV_cooperative_matrix
+                sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+#endif
+            }
+        }
+    }
+
+    uint k = 0;
+
+    if (kk >= UNROLL_SG_K * 2)
+    {
+        // local stack and shared memory ping-pong
+
+        // prefetch
+        int prefetch_tmp_v[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4p + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)];
+        int prefetch_tmp_k[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4p + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)];
+
+        // prefetch the very first
+        {
+            if (elempack == 1)
+            {
+                const uint cstepd4 = psc(cstep) / 4;
+
+                const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                    {
+                        const uint zk = siq / Md4_K_USGM;
+                        const uint zmij = siq % Md4_K_USGM;
+                        const uint zm = zmij / (Md4 * K);
+                        const uint ij = zmij % (Md4 * K);
+                        const uint i = ij / Md4;
+                        const uint j = ij % Md4;
+
+                        const uint gk = zk * K + i;
+                        const uint gm = (mi + zm) * Md4 + j;
+
+                        prefetch_tmp_v[q] = gk < psc(num_input) && gm < cstepd4 ? i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0;
+                    }
+                }
+            }
+            else // elempack == 4
+            {
+                const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                    {
+                        const uint zk = siq / M_Kd4p_USGM;
+                        const uint zmij = siq % M_Kd4p_USGM;
+                        const uint zmi = zmij / Kd4p;
+                        const uint j = zmij % Kd4p;
+
+                        const uint gm = mi * M + zmi;
+                        const uint gk = zk * K + j * 4;
+
+                        const int v = gm < psc(size) && gk < psc(num_input) ? i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0;
+
+                        prefetch_tmp_v[q] = v;
+                    }
+                }
+            }
+        }
+        {
+            const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+            const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+            const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK;
+            const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq);
+                }
+            }
+        }
+
+        for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K)
+        {
+            // copy prefetched tile to shared memory
+            {
+                if (elempack == 1)
+                {
+                    const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                        }
+                    }
+                }
+            }
+            {
+                const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        tmp_k[sgni][siq] = prefetch_tmp_k[q];
+                    }
+                }
+            }
+
+            barrier();
+
+            // prefetch next tile
+            const uint ki = k + UNROLL_SG_K;
+            {
+                if (elempack == 1)
+                {
+                    const uint cstepd4 = psc(cstep) / 4;
+
+                    const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                    const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+
+                            prefetch_tmp_v[q] = gk < psc(num_input) && gm < cstepd4 ? i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0;
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                    const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zmi = zmij / Kd4p;
+                            const uint j = zmij % Kd4p;
+
+                            const uint gm = mi * M + zmi;
+                            const uint gk = (ki + zk) * K + j * 4;
+
+                            const int v = gm < psc(size) && gk < psc(num_input) ? i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0;
+
+                            prefetch_tmp_v[q] = v;
+                        }
+                    }
+                }
+            }
+            {
+                const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+                const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+
+                const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK;
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq);
+                    }
+                }
+            }
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else // elempack == 4
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+
+        // copy and compute the last prefetched tile
+        {
+            if (elempack == 1)
+            {
+                const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K;
+                const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                    {
+                        tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                    }
+                }
+            }
+            else // elempack == 4
+            {
+                const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K;
+                const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                    {
+                        tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                    }
+                }
+            }
+        }
+        {
+            const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K;
+            const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    tmp_k[sgni][siq] = prefetch_tmp_k[q];
+                }
+            }
+        }
+
+        barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+        coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+        coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+        icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+        icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+        [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+                if (elempack == 1)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                }
+                else // elempack == 4
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                }
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                }
+            }
+        }
+
+        barrier();
+    }
+    else
+    {
+        for (uint ki = 0; ki < kk; ki += UNROLL_SG_K)
+        {
+            {
+                if (elempack == 1)
+                {
+                    const uint cstepd4 = psc(cstep) / 4;
+
+                    const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                    const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+
+                            tmp_v[sgmi][siq] = gk < psc(num_input) && gm < cstepd4 ? i8buffer_sm4(bottom_blob_int8_data, gk * cstepd4 + gm) : 0;
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                    const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zm = zmij / (M * Kd4p);
+                            const uint ij = zmij % (M * Kd4p);
+                            const uint i = ij / Kd4p;
+                            const uint j = ij % Kd4p;
+
+                            const uint gm = (mi + zm) * M + i;
+                            const uint gk = (ki + zk) * K + j * 4;
+                            const int v = gm < psc(size) && gk < psc(num_input) ? i8buffer_sm4(bottom_blob_int8_data, (gk / 4) * psc(cstep) + gm) : 0;
+
+                            tmp_v[sgmi][siq] = v;
+                        }
+                    }
+                }
+            }
+
+            {
+                const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+                const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK;
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        tmp_k[sgni][siq] = i8buffer_sm4(weight_data, w_offset + siq);
+                    }
+                }
+            }
+
+            barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else // elempack == 4
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+    }
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+            if (out_elempack == 1)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true);
+#endif
+            }
+            else // out_elempack == 4
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false);
+#endif
+            }
+        }
+    }
+
+    barrier();
+
+    if (out_elempack == 1)
+    {
+        const uint outcstepd4 = psc(outcstep) / 4;
+        const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N;
+        const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN)
+            {
+                const uint zn = siq / (Md4 * N * UNROLL_SG_M);
+                const uint zmij = siq % (Md4 * N * UNROLL_SG_M);
+                const uint zm = zmij / (Md4 * N);
+                const uint ij = zmij % (Md4 * N);
+                const uint i = ij / Md4;
+                const uint j = ij % Md4;
+
+                const uint gn = (ni + zn) * N + i;
+                const uint gm = (mi + zm) * Md4 + j;
+
+                if (gn < psc(num_output) && gm * 4 < psc(size))
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    const int gn4 = int(gn % 4);
+                    vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4][gn4];
+
+                    if (bias_term == 1)
+                    {
+                        sumfp += bias_data[gn / 4][gn4];
+                    }
+
+                    sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1));
+
+                    if (use_int8_requantize == 1)
+                    {
+                        const float top_scale = float(buffer_ld1(top_scales_data, 0));
+                        sumfp *= top_scale;
+                        i8buffer_st4(top_blob_int8_data, gn * outcstepd4 + gm, float2int8vec4(sumfp));
+                    }
+                    else
+                    {
+                        buffer_st4(top_blob_data, gn * outcstepd4 + gm, afpvec4(sumfp));
+                    }
+                }
+            }
+        }
+    }
+    else // out_elempack == 4
+    {
+        const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N;
+        const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN)
+            {
+                const uint zn = siq / (M * Nd4 * UNROLL_SG_M);
+                const uint zmij = siq % (M * Nd4 * UNROLL_SG_M);
+                const uint zm = zmij / (M * Nd4);
+                const uint ij = zmij % (M * Nd4);
+                const uint i = ij / Nd4;
+                const uint j = ij % Nd4;
+
+                const uint gm = (mi + zm) * M + i;
+                const uint gn = (ni + zn) * N + j * 4;
+
+                if (gm < psc(size) && gn < psc(num_output))
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4];
+
+                    if (bias_term == 1)
+                    {
+                        sumfp += bias_data[gn / 4];
+                    }
+
+                    sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1));
+
+                    if (use_int8_requantize == 1)
+                    {
+                        const float top_scale = float(buffer_ld1(top_scales_data, 0));
+                        sumfp *= top_scale;
+                        i8buffer_st4(top_blob_int8_data, (gn / 4) * psc(outcstep) + gm, float2int8vec4(sumfp));
+                    }
+                    else
+                    {
+                        buffer_st4(top_blob_data, (gn / 4) * psc(outcstep) + gm, afpvec4(sumfp));
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp
new file mode 100644
index 000000000000..1607abb48fec
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8.comp
@@ -0,0 +1,131 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // number of gz slices; main() uses it to bound gz and to size each output plane
+// Shape constants take ids 1..6 (shape_constant_id_offset + 0..5); main() reads all of them through psc().
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input width, used for index math and right-edge checks
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // input height, used for bottom-edge checks
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // input element stride between gz slices
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // output element stride between gz slices
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // tile-grid width (upper bound for gx)
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // tile-grid height (upper bound for gy)
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; // source taps, read with i8buffer_ld1
+layout(binding = 1) writeonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; // transformed values, written with i16buffer_st1
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+    int c; // NOTE(review): declared here, but main() only reads the specialization constant c - confirm intended
+} p;
+
+void main() // winograd23 input transform (per the file name): one invocation turns one 4x4 int8 tile into 16 int16 values
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // input slice
+    // NOTE(review): block_x/block_y go through psc() but c is compared directly - confirm c is always specialized
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c)
+        return;
+
+    // load 4x4 tile with top-left (sx, sy); tiles advance by 2, so neighbouring tiles share 2 rows/columns
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of the tile's top-left tap
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // first element of each of the 4 tile rows
+    // vRC is row R, column C of the tile; taps with sx + C >= w or sy + R >= h are replaced by 0
+    int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); // NOTE(review): unguarded - presumably (sx, sy) is always inside the image; confirm
+    int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0;
+    int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0;
+    int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0;
+
+    int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0;
+    int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0;
+    int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0;
+    int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0;
+
+    int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0;
+    int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0;
+    int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0;
+    int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0;
+
+    int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0;
+    int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0;
+    int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0;
+    int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0;
+
+    // pass 1 along each tile row, with implicit transpose: mKR holds output K computed from tile row R
+    int m00 = v00 - v02; // output 0 = tap0 - tap2
+    int m01 = v10 - v12;
+    int m02 = v20 - v22;
+    int m03 = v30 - v32;
+
+    int m10 = v02 + v01; // output 1 = tap2 + tap1
+    int m11 = v12 + v11;
+    int m12 = v22 + v21;
+    int m13 = v32 + v31;
+
+    int m20 = v02 - v01; // output 2 = tap2 - tap1
+    int m21 = v12 - v11;
+    int m22 = v22 - v21;
+    int m23 = v32 - v31;
+
+    int m30 = v03 - v01; // output 3 = tap3 - tap1
+    int m31 = v13 - v11;
+    int m32 = v23 - v21;
+    int m33 = v33 - v31;
+    // pass 2: the same four combinations applied along the second index of m; the v variables are reused for the results
+    v00 = m00 - m02;
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m02 + m01;
+    v11 = m12 + m11;
+    v21 = m22 + m21;
+    v31 = m32 + m31;
+
+    v02 = m02 - m01;
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01;
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16 values, one per output plane: value vKJ goes to plane K * 4 + J
+    int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; // position of this tile inside one plane
+    int v_tm_step = psc(outcstep) * c; // elements per plane (c slices of outcstep each)
+    // each value is a +/- sum of 4 taps; assuming taps are int8 it lies in [-512, 512] and fits int16 - TODO confirm
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v10);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v11);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v12);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v13);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v20);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v21);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v22);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v23);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v30);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v31);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v32);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v33);
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp
new file mode 100644
index 000000000000..0249e01fac1d
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input_int8_cm.comp
@@ -0,0 +1,180 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // number of gz slices; main() uses it to bound gz and to size each output plane
+// Shape constants take ids 1..6 (shape_constant_id_offset + 0..5); main() reads all of them through psc().
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input width, used for index math and right-edge checks
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // input height, used for bottom-edge checks
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // input element stride between gz slices
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // output element stride between gz slices (same for both outputs)
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // tile-grid width (upper bound for gx)
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // tile-grid height (upper bound for gy)
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; // source taps, read with i8buffer_ld1
+layout(binding = 1) writeonly buffer bottom_tm_low_blob { sint8 bottom_tm_low_data[]; }; // signed low byte of each transformed value
+layout(binding = 2) writeonly buffer bottom_tm_high_blob { sint8 bottom_tm_high_data[]; }; // high part: (value - low) >> 8, same indexing as the low buffer
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+    int c; // NOTE(review): declared here, but main() only reads the specialization constant c - confirm intended
+} p;
+
+void main() // winograd23 input transform (per the file name); each of the 16 results is stored as a signed low byte plus a high byte
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // input slice
+    // NOTE(review): block_x/block_y go through psc() but c is compared directly - confirm c is always specialized
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c)
+        return;
+
+    // load 4x4 tile with top-left (sx, sy); tiles advance by 2, so neighbouring tiles share 2 rows/columns
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of the tile's top-left tap
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // first element of each of the 4 tile rows
+    // vRC is row R, column C of the tile; taps with sx + C >= w or sy + R >= h are replaced by 0
+    int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0); // NOTE(review): unguarded - presumably (sx, sy) is always inside the image; confirm
+    int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0;
+    int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0;
+    int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0;
+
+    int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0;
+    int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0;
+    int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0;
+    int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0;
+
+    int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0;
+    int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0;
+    int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0;
+    int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0;
+
+    int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0;
+    int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0;
+    int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0;
+    int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0;
+
+    // pass 1 along each tile row, with implicit transpose: mKR holds output K computed from tile row R
+    int m00 = v00 - v02; // output 0 = tap0 - tap2
+    int m01 = v10 - v12;
+    int m02 = v20 - v22;
+    int m03 = v30 - v32;
+
+    int m10 = v02 + v01; // output 1 = tap2 + tap1
+    int m11 = v12 + v11;
+    int m12 = v22 + v21;
+    int m13 = v32 + v31;
+
+    int m20 = v02 - v01; // output 2 = tap2 - tap1
+    int m21 = v12 - v11;
+    int m22 = v22 - v21;
+    int m23 = v32 - v31;
+
+    int m30 = v03 - v01; // output 3 = tap3 - tap1
+    int m31 = v13 - v11;
+    int m32 = v23 - v21;
+    int m33 = v33 - v31;
+    // pass 2: the same four combinations applied along the second index of m; the v variables are reused for the results
+    v00 = m00 - m02;
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m02 + m01;
+    v11 = m12 + m11;
+    v21 = m22 + m21;
+    v31 = m32 + m31;
+
+    v02 = m02 - m01;
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01;
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16 values, one per output plane (vKJ -> plane K * 4 + J), each split so that v == high * 256 + low
+    int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx; // position of this tile inside one plane
+    int v_tm_step = psc(outcstep) * c; // elements per plane (c slices of outcstep each)
+    // assuming int8 taps, each value lies in [-512, 512], so its high part lies in [-2, 2] - TODO confirm i8buffer_ld1 range
+    int v00_low = v00 & 255; // low 8 bits as 0..255
+    v00_low = v00_low >= 128 ? v00_low - 256 : v00_low; // reinterpret as signed byte, -128..127
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 0 * v_tm_step, v00_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 0 * v_tm_step, (v00 - v00_low) >> 8); // v - low is a multiple of 256, so the shift divides exactly
+    int v01_low = v01 & 255;
+    v01_low = v01_low >= 128 ? v01_low - 256 : v01_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 1 * v_tm_step, v01_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 1 * v_tm_step, (v01 - v01_low) >> 8);
+    int v02_low = v02 & 255;
+    v02_low = v02_low >= 128 ? v02_low - 256 : v02_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 2 * v_tm_step, v02_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 2 * v_tm_step, (v02 - v02_low) >> 8);
+    int v03_low = v03 & 255;
+    v03_low = v03_low >= 128 ? v03_low - 256 : v03_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 3 * v_tm_step, v03_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 3 * v_tm_step, (v03 - v03_low) >> 8);
+    int v10_low = v10 & 255;
+    v10_low = v10_low >= 128 ? v10_low - 256 : v10_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 4 * v_tm_step, v10_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 4 * v_tm_step, (v10 - v10_low) >> 8);
+    int v11_low = v11 & 255;
+    v11_low = v11_low >= 128 ? v11_low - 256 : v11_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 5 * v_tm_step, v11_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 5 * v_tm_step, (v11 - v11_low) >> 8);
+    int v12_low = v12 & 255;
+    v12_low = v12_low >= 128 ? v12_low - 256 : v12_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 6 * v_tm_step, v12_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 6 * v_tm_step, (v12 - v12_low) >> 8);
+    int v13_low = v13 & 255;
+    v13_low = v13_low >= 128 ? v13_low - 256 : v13_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 7 * v_tm_step, v13_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 7 * v_tm_step, (v13 - v13_low) >> 8);
+    int v20_low = v20 & 255;
+    v20_low = v20_low >= 128 ? v20_low - 256 : v20_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 8 * v_tm_step, v20_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 8 * v_tm_step, (v20 - v20_low) >> 8);
+    int v21_low = v21 & 255;
+    v21_low = v21_low >= 128 ? v21_low - 256 : v21_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 9 * v_tm_step, v21_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 9 * v_tm_step, (v21 - v21_low) >> 8);
+    int v22_low = v22 & 255;
+    v22_low = v22_low >= 128 ? v22_low - 256 : v22_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 10 * v_tm_step, v22_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 10 * v_tm_step, (v22 - v22_low) >> 8);
+    int v23_low = v23 & 255;
+    v23_low = v23_low >= 128 ? v23_low - 256 : v23_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 11 * v_tm_step, v23_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 11 * v_tm_step, (v23 - v23_low) >> 8);
+    int v30_low = v30 & 255;
+    v30_low = v30_low >= 128 ? v30_low - 256 : v30_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 12 * v_tm_step, v30_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 12 * v_tm_step, (v30 - v30_low) >> 8);
+    int v31_low = v31 & 255;
+    v31_low = v31_low >= 128 ? v31_low - 256 : v31_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 13 * v_tm_step, v31_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 13 * v_tm_step, (v31 - v31_low) >> 8);
+    int v32_low = v32 & 255;
+    v32_low = v32_low >= 128 ? v32_low - 256 : v32_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 14 * v_tm_step, v32_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 14 * v_tm_step, (v32 - v32_low) >> 8);
+    int v33_low = v33 & 255;
+    v33_low = v33_low >= 128 ? v33_low - 256 : v33_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 15 * v_tm_step, v33_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 15 * v_tm_step, (v33 - v33_low) >> 8);
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp
new file mode 100644
index 000000000000..206409c65f9c
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output_int8.comp
@@ -0,0 +1,140 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // 1: main() adds bias_data[gz] to every output
+layout(constant_id = 1) const int activation_type = 0; // forwarded to activation_afp()
+layout(constant_id = 2) const float activation_param_0 = 0; // forwarded to activation_afp()
+layout(constant_id = 3) const float activation_param_1 = 0; // forwarded to activation_afp()
+layout(constant_id = 4) const int use_int8_requantize = 0; // 1: scale and store through binding 5, otherwise store through binding 1
+layout(binding = 0) readonly buffer top_tm_blob { int top_tm_blob_data[]; }; // int accumulators, 16 read per invocation
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // written when use_int8_requantize != 1
+layout(binding = 2) readonly buffer bias_blob { float bias_data[]; }; // indexed by gz
+layout(binding = 3) readonly buffer weight_descales_blob { float weight_descales_data[]; }; // indexed by gz
+layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // only element 0 is read
+layout(binding = 5) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; // written when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter
+{
+    int cstep; // per-gz stride into top_tm_blob_data
+
+    int block_x; // tile count along x; also the row stride inside v_tm_offset
+    int block_y; // tile count along y
+
+    int outw; // output row stride and x clip limit for the 2x2 store
+    int outh; // y clip limit for the 2x2 store
+    int outcstep; // per-gz stride into the output buffer
+    int outc; // upper bound for gz; times cstep gives the coefficient stride
+} p;
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.block_x || gy >= p.block_y || gz >= p.outc) // invocation lies outside the tile grid / gz range
+        return;
+
+    // load 16 int accumulators of tile (gx, gy) for index gz; coefficient k sits at v_tm_offset + k * v_tm_step
+    int v_tm_offset = gz * p.cstep + gy * p.block_x + gx;
+    int v_tm_step = p.cstep * p.outc;
+
+    int v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step];
+    int v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step];
+    int v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step];
+    int v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step];
+    int v10 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step];
+    int v11 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step];
+    int v12 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step];
+    int v13 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step];
+    int v20 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step];
+    int v21 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step];
+    int v22 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step];
+    int v23 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step];
+    int v30 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step];
+    int v31 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step];
+    int v32 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step];
+    int v33 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step];
+
+    int m00 = v00 + v01 + v02; // first 1D pass over the second index of v, weights {1, 1, 1, 0}
+    int m01 = v10 + v11 + v12;
+    int m02 = v20 + v21 + v22;
+    int m03 = v30 + v31 + v32;
+
+    int m10 = v01 - v02 + v03; // first 1D pass, weights {0, 1, -1, 1}
+    int m11 = v11 - v12 + v13;
+    int m12 = v21 - v22 + v23;
+    int m13 = v31 - v32 + v33;
+
+    const float descale = weight_descales_data[gz] * 0.25f; // per-gz descale with an extra factor 1/4 folded in
+
+    float out00 = float(m00 + m01 + m02) * descale; // second 1D pass reuses the two weight sets, then converts to float
+    float out01 = float(m01 - m02 + m03) * descale;
+    float out10 = float(m10 + m11 + m12) * descale;
+    float out11 = float(m11 - m12 + m13) * descale;
+
+    if (bias_term == 1)
+    {
+        const float bias_value = bias_data[gz];
+
+        out00 += bias_value;
+        out01 += bias_value;
+        out10 += bias_value;
+        out11 += bias_value;
+    }
+
+    out00 = float(activation_afp(afp(out00), activation_type, activation_param_0, activation_param_1)); // activation is evaluated in afp, result converted back to float
+    out01 = float(activation_afp(afp(out01), activation_type, activation_param_0, activation_param_1));
+    out10 = float(activation_afp(afp(out10), activation_type, activation_param_0, activation_param_1));
+    out11 = float(activation_afp(afp(out11), activation_type, activation_param_0, activation_param_1));
+
+    // store the 2x2 output tile whose top-left element is (x, y); the second column and row are clipped to outw / outh
+    int x = gx * 2;
+    int y = gy * 2;
+
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = buffer_ld1(top_scales_data, 0); // a single scale (element 0) is applied to all four outputs
+
+        out00 *= top_scale;
+        out01 *= top_scale;
+        out10 *= top_scale;
+        out11 *= top_scale;
+
+        int out00_int8 = float2int8(out00); // float2int8 is defined outside this file; presumably rounds and saturates - confirm
+        int out01_int8 = float2int8(out01);
+        int out10_int8 = float2int8(out10);
+        int out11_int8 = float2int8(out11);
+
+        int v_offset_0 = gz * p.outcstep + y * p.outw + x; // element (x, y) for index gz
+        int v_offset_1 = v_offset_0 + p.outw; // same column, next output row
+
+        i8buffer_st1(top_blob_int8_data, v_offset_0 + 0, out00_int8);
+        if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset_0 + 1, out01_int8);
+
+        if (y + 1 < p.outh)
+        {
+            i8buffer_st1(top_blob_int8_data, v_offset_1 + 0, out10_int8);
+            if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset_1 + 1, out11_int8);
+        }
+    }
+    else
+    {
+        int v_offset_0 = gz * p.outcstep + y * p.outw + x; // same addressing as the int8 branch
+        int v_offset_1 = v_offset_0 + p.outw;
+
+        buffer_st1(top_blob_data, v_offset_0 + 0, afp(out00));
+        if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset_0 + 1, afp(out01));
+
+        if (y + 1 < p.outh)
+        {
+            buffer_st1(top_blob_data, v_offset_1 + 0, afp(out10));
+            if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset_1 + 1, afp(out11));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp
new file mode 100644
index 000000000000..578079b2aca7
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8.comp
@@ -0,0 +1,219 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // upper bound for gz in main(); also multiplies outcstep to form v_tm_step
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // the constants below are read through psc() in main()
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; // input, a 6x6 patch is read per invocation
+layout(binding = 1) writeonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; // output, 36 coefficients written per invocation
+
+layout(push_constant) uniform parameter
+{
+    int w; // input row stride and x limit for the patch loads
+    int h; // y limit for the patch loads
+    int cstep; // per-gz stride into the input buffer
+
+    int outcstep; // per-gz stride into the output buffer
+
+    int block_x; // tile count along x; also the row stride inside v_tm_offset
+    int block_y; // tile count along y
+
+    int c; // NOTE(review): never referenced in main(), which reads the constant c directly - confirm intended
+} p;
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) // NOTE(review): c is used without psc(), unlike block_x / block_y - confirm c is always specialized
+        return;
+
+    // load the 6x6 input patch whose top-left element is (sx, sy); taps at or beyond w / h are replaced by 0
+    int sx = gx * 4; // patches advance by 4 but span 6, so neighbouring patches share 2 rows / columns
+    int sy = gy * 4;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx;
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // offsets of patch rows 0..3
+    ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); // offsets of patch rows 4 and 5
+
+    int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0);
+    int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0;
+    int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0;
+    int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0;
+    int v04 = sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 4) : 0;
+    int v05 = sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 5) : 0;
+
+    int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0;
+    int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0;
+    int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0;
+    int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0;
+    int v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 4) : 0;
+    int v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 5) : 0;
+
+    int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0;
+    int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0;
+    int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0;
+    int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0;
+    int v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 4) : 0;
+    int v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 5) : 0;
+
+    int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0;
+    int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0;
+    int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0;
+    int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0;
+    int v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 4) : 0;
+    int v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 5) : 0;
+
+    int v40 = sy + 4 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 0) : 0;
+    int v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 1) : 0;
+    int v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 2) : 0;
+    int v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 3) : 0;
+    int v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 4) : 0;
+    int v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 5) : 0;
+
+    int v50 = sy + 5 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 0) : 0;
+    int v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 1) : 0;
+    int v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 2) : 0;
+    int v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 3) : 0;
+    int v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 4) : 0;
+    int v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 5) : 0;
+
+    // first 1D pass along each patch row; m[j][i] holds output j of row i, which is the implicit transpose
+    int m00 = v04 + v00 * 4 - v02 * 5;
+    int m01 = v14 + v10 * 4 - v12 * 5;
+    int m02 = v24 + v20 * 4 - v22 * 5;
+    int m03 = v34 + v30 * 4 - v32 * 5;
+    int m04 = v44 + v40 * 4 - v42 * 5;
+    int m05 = v54 + v50 * 4 - v52 * 5;
+
+    int m10 = (v04 - v02 * 4) + (v03 - v01 * 4);
+    int m11 = (v14 - v12 * 4) + (v13 - v11 * 4);
+    int m12 = (v24 - v22 * 4) + (v23 - v21 * 4);
+    int m13 = (v34 - v32 * 4) + (v33 - v31 * 4);
+    int m14 = (v44 - v42 * 4) + (v43 - v41 * 4);
+    int m15 = (v54 - v52 * 4) + (v53 - v51 * 4);
+
+    int m20 = (v04 - v02 * 4) - (v03 - v01 * 4);
+    int m21 = (v14 - v12 * 4) - (v13 - v11 * 4);
+    int m22 = (v24 - v22 * 4) - (v23 - v21 * 4);
+    int m23 = (v34 - v32 * 4) - (v33 - v31 * 4);
+    int m24 = (v44 - v42 * 4) - (v43 - v41 * 4);
+    int m25 = (v54 - v52 * 4) - (v53 - v51 * 4);
+
+    int m30 = (v04 - v02) + (v03 - v01) * 2;
+    int m31 = (v14 - v12) + (v13 - v11) * 2;
+    int m32 = (v24 - v22) + (v23 - v21) * 2;
+    int m33 = (v34 - v32) + (v33 - v31) * 2;
+    int m34 = (v44 - v42) + (v43 - v41) * 2;
+    int m35 = (v54 - v52) + (v53 - v51) * 2;
+
+    int m40 = (v04 - v02) - (v03 - v01) * 2;
+    int m41 = (v14 - v12) - (v13 - v11) * 2;
+    int m42 = (v24 - v22) - (v23 - v21) * 2;
+    int m43 = (v34 - v32) - (v33 - v31) * 2;
+    int m44 = (v44 - v42) - (v43 - v41) * 2;
+    int m45 = (v54 - v52) - (v53 - v51) * 2;
+
+    int m50 = v05 + v01 * 4 - v03 * 5;
+    int m51 = v15 + v11 * 4 - v13 * 5;
+    int m52 = v25 + v21 * 4 - v23 * 5;
+    int m53 = v35 + v31 * 4 - v33 * 5;
+    int m54 = v45 + v41 * 4 - v43 * 5;
+    int m55 = v55 + v51 * 4 - v53 * 5;
+
+    v00 = m04 + m00 * 4 - m02 * 5; // second 1D pass with the same six weight sets over m[j][0..5]; v is reused for the results
+    v10 = m14 + m10 * 4 - m12 * 5;
+    v20 = m24 + m20 * 4 - m22 * 5;
+    v30 = m34 + m30 * 4 - m32 * 5;
+    v40 = m44 + m40 * 4 - m42 * 5;
+    v50 = m54 + m50 * 4 - m52 * 5;
+
+    v01 = (m04 - m02 * 4) + (m03 - m01 * 4);
+    v11 = (m14 - m12 * 4) + (m13 - m11 * 4);
+    v21 = (m24 - m22 * 4) + (m23 - m21 * 4);
+    v31 = (m34 - m32 * 4) + (m33 - m31 * 4);
+    v41 = (m44 - m42 * 4) + (m43 - m41 * 4);
+    v51 = (m54 - m52 * 4) + (m53 - m51 * 4);
+
+    v02 = (m04 - m02 * 4) - (m03 - m01 * 4);
+    v12 = (m14 - m12 * 4) - (m13 - m11 * 4);
+    v22 = (m24 - m22 * 4) - (m23 - m21 * 4);
+    v32 = (m34 - m32 * 4) - (m33 - m31 * 4);
+    v42 = (m44 - m42 * 4) - (m43 - m41 * 4);
+    v52 = (m54 - m52 * 4) - (m53 - m51 * 4);
+
+    v03 = (m04 - m02) + (m03 - m01) * 2;
+    v13 = (m14 - m12) + (m13 - m11) * 2;
+    v23 = (m24 - m22) + (m23 - m21) * 2;
+    v33 = (m34 - m32) + (m33 - m31) * 2;
+    v43 = (m44 - m42) + (m43 - m41) * 2;
+    v53 = (m54 - m52) + (m53 - m51) * 2;
+
+    v04 = (m04 - m02) - (m03 - m01) * 2;
+    v14 = (m14 - m12) - (m13 - m11) * 2;
+    v24 = (m24 - m22) - (m23 - m21) * 2;
+    v34 = (m34 - m32) - (m33 - m31) * 2;
+    v44 = (m44 - m42) - (m43 - m41) * 2;
+    v54 = (m54 - m52) - (m53 - m51) * 2;
+
+    v05 = m05 + m01 * 4 - m03 * 5;
+    v15 = m15 + m11 * 4 - m13 * 5;
+    v25 = m25 + m21 * 4 - m23 * 5;
+    v35 = m35 + m31 * 4 - m33 * 5;
+    v45 = m45 + m41 * 4 - m43 * 5;
+    v55 = m55 + m51 * 4 - m53 * 5;
+
+    // store 36 coefficients of tile (gx, gy) for index gz; coefficient k goes to v_tm_offset + k * v_tm_step
+    int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx;
+    int v_tm_step = psc(outcstep) * c;
+
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00); // NOTE(review): each pass scales by at most 10, so |v| <= 12800 if i8buffer_ld1 yields [-128, 127] - confirm
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v04);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v05);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v10);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v11);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v12);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v13);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v14);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v15);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v20);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v21);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v22);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v23);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 16 * v_tm_step, v24);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 17 * v_tm_step, v25);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 18 * v_tm_step, v30);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 19 * v_tm_step, v31);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 20 * v_tm_step, v32);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 21 * v_tm_step, v33);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 22 * v_tm_step, v34);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 23 * v_tm_step, v35);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 24 * v_tm_step, v40);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 25 * v_tm_step, v41);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 26 * v_tm_step, v42);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 27 * v_tm_step, v43);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 28 * v_tm_step, v44);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 29 * v_tm_step, v45);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 30 * v_tm_step, v50);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 31 * v_tm_step, v51);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 32 * v_tm_step, v52);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 33 * v_tm_step, v53);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 34 * v_tm_step, v54);
+    i16buffer_st1(bottom_tm_blob_data, v_tm_offset + 35 * v_tm_step, v55);
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp
new file mode 100644
index 000000000000..1e82a72d20e3
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input_int8_cm.comp
@@ -0,0 +1,328 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // upper bound for gz in main(); also multiplies outcstep to form v_tm_step
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // the constants below are read through psc() in main()
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; // input, a 6x6 patch is read per invocation
+layout(binding = 1) writeonly buffer bottom_tm_low_blob { sint8 bottom_tm_low_data[]; }; // signed low byte of each coefficient
+layout(binding = 2) writeonly buffer bottom_tm_high_blob { sint8 bottom_tm_high_data[]; }; // remaining high part, (value - low) >> 8
+
+layout(push_constant) uniform parameter
+{
+    int w; // input row stride and x limit for the patch loads
+    int h; // y limit for the patch loads
+    int cstep; // per-gz stride into the input buffer
+
+    int outcstep; // per-gz stride into both output buffers
+
+    int block_x; // tile count along x; also the row stride inside v_tm_offset
+    int block_y; // tile count along y
+
+    int c; // NOTE(review): never referenced in main(), which reads the constant c directly - confirm intended
+} p;
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= c) // NOTE(review): c is used without psc(), unlike block_x / block_y - confirm c is always specialized
+        return;
+
+    // load the 6x6 input patch whose top-left element is (sx, sy); taps at or beyond w / h are replaced by 0
+    int sx = gx * 4; // patches advance by 4 but span 6, so neighbouring patches share 2 rows / columns
+    int sy = gy * 4;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx;
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // offsets of patch rows 0..3
+    ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); // offsets of patch rows 4 and 5
+
+    int v00 = i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 0);
+    int v01 = sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 1) : 0;
+    int v02 = sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 2) : 0;
+    int v03 = sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 3) : 0;
+    int v04 = sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 4) : 0;
+    int v05 = sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.r + 5) : 0;
+
+    int v10 = sy + 1 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 0) : 0;
+    int v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 1) : 0;
+    int v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 2) : 0;
+    int v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 3) : 0;
+    int v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 4) : 0;
+    int v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.g + 5) : 0;
+
+    int v20 = sy + 2 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 0) : 0;
+    int v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 1) : 0;
+    int v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 2) : 0;
+    int v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 3) : 0;
+    int v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 4) : 0;
+    int v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.b + 5) : 0;
+
+    int v30 = sy + 3 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 0) : 0;
+    int v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 1) : 0;
+    int v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 2) : 0;
+    int v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 3) : 0;
+    int v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 4) : 0;
+    int v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset.a + 5) : 0;
+
+    int v40 = sy + 4 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 0) : 0;
+    int v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 1) : 0;
+    int v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 2) : 0;
+    int v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 3) : 0;
+    int v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 4) : 0;
+    int v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.x + 5) : 0;
+
+    int v50 = sy + 5 < psc(h) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 0) : 0;
+    int v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 1) : 0;
+    int v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 2) : 0;
+    int v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 3) : 0;
+    int v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 4) : 0;
+    int v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld1(bottom_blob_int8_data, v_offset45.y + 5) : 0;
+
+    // first 1D pass along each patch row; m[j][i] holds output j of row i, which is the implicit transpose
+    int m00 = v04 + v00 * 4 - v02 * 5;
+    int m01 = v14 + v10 * 4 - v12 * 5;
+    int m02 = v24 + v20 * 4 - v22 * 5;
+    int m03 = v34 + v30 * 4 - v32 * 5;
+    int m04 = v44 + v40 * 4 - v42 * 5;
+    int m05 = v54 + v50 * 4 - v52 * 5;
+
+    int m10 = (v04 - v02 * 4) + (v03 - v01 * 4);
+    int m11 = (v14 - v12 * 4) + (v13 - v11 * 4);
+    int m12 = (v24 - v22 * 4) + (v23 - v21 * 4);
+    int m13 = (v34 - v32 * 4) + (v33 - v31 * 4);
+    int m14 = (v44 - v42 * 4) + (v43 - v41 * 4);
+    int m15 = (v54 - v52 * 4) + (v53 - v51 * 4);
+
+    int m20 = (v04 - v02 * 4) - (v03 - v01 * 4);
+    int m21 = (v14 - v12 * 4) - (v13 - v11 * 4);
+    int m22 = (v24 - v22 * 4) - (v23 - v21 * 4);
+    int m23 = (v34 - v32 * 4) - (v33 - v31 * 4);
+    int m24 = (v44 - v42 * 4) - (v43 - v41 * 4);
+    int m25 = (v54 - v52 * 4) - (v53 - v51 * 4);
+
+    int m30 = (v04 - v02) + (v03 - v01) * 2;
+    int m31 = (v14 - v12) + (v13 - v11) * 2;
+    int m32 = (v24 - v22) + (v23 - v21) * 2;
+    int m33 = (v34 - v32) + (v33 - v31) * 2;
+    int m34 = (v44 - v42) + (v43 - v41) * 2;
+    int m35 = (v54 - v52) + (v53 - v51) * 2;
+
+    int m40 = (v04 - v02) - (v03 - v01) * 2;
+    int m41 = (v14 - v12) - (v13 - v11) * 2;
+    int m42 = (v24 - v22) - (v23 - v21) * 2;
+    int m43 = (v34 - v32) - (v33 - v31) * 2;
+    int m44 = (v44 - v42) - (v43 - v41) * 2;
+    int m45 = (v54 - v52) - (v53 - v51) * 2;
+
+    int m50 = v05 + v01 * 4 - v03 * 5;
+    int m51 = v15 + v11 * 4 - v13 * 5;
+    int m52 = v25 + v21 * 4 - v23 * 5;
+    int m53 = v35 + v31 * 4 - v33 * 5;
+    int m54 = v45 + v41 * 4 - v43 * 5;
+    int m55 = v55 + v51 * 4 - v53 * 5;
+
+    v00 = m04 + m00 * 4 - m02 * 5; // second 1D pass with the same six weight sets over m[j][0..5]; v is reused for the results
+    v10 = m14 + m10 * 4 - m12 * 5;
+    v20 = m24 + m20 * 4 - m22 * 5;
+    v30 = m34 + m30 * 4 - m32 * 5;
+    v40 = m44 + m40 * 4 - m42 * 5;
+    v50 = m54 + m50 * 4 - m52 * 5;
+
+    v01 = (m04 - m02 * 4) + (m03 - m01 * 4);
+    v11 = (m14 - m12 * 4) + (m13 - m11 * 4);
+    v21 = (m24 - m22 * 4) + (m23 - m21 * 4);
+    v31 = (m34 - m32 * 4) + (m33 - m31 * 4);
+    v41 = (m44 - m42 * 4) + (m43 - m41 * 4);
+    v51 = (m54 - m52 * 4) + (m53 - m51 * 4);
+
+    v02 = (m04 - m02 * 4) - (m03 - m01 * 4);
+    v12 = (m14 - m12 * 4) - (m13 - m11 * 4);
+    v22 = (m24 - m22 * 4) - (m23 - m21 * 4);
+    v32 = (m34 - m32 * 4) - (m33 - m31 * 4);
+    v42 = (m44 - m42 * 4) - (m43 - m41 * 4);
+    v52 = (m54 - m52 * 4) - (m53 - m51 * 4);
+
+    v03 = (m04 - m02) + (m03 - m01) * 2;
+    v13 = (m14 - m12) + (m13 - m11) * 2;
+    v23 = (m24 - m22) + (m23 - m21) * 2;
+    v33 = (m34 - m32) + (m33 - m31) * 2;
+    v43 = (m44 - m42) + (m43 - m41) * 2;
+    v53 = (m54 - m52) + (m53 - m51) * 2;
+
+    v04 = (m04 - m02) - (m03 - m01) * 2;
+    v14 = (m14 - m12) - (m13 - m11) * 2;
+    v24 = (m24 - m22) - (m23 - m21) * 2;
+    v34 = (m34 - m32) - (m33 - m31) * 2;
+    v44 = (m44 - m42) - (m43 - m41) * 2;
+    v54 = (m54 - m52) - (m53 - m51) * 2;
+
+    v05 = m05 + m01 * 4 - m03 * 5;
+    v15 = m15 + m11 * 4 - m13 * 5;
+    v25 = m25 + m21 * 4 - m23 * 5;
+    v35 = m35 + m31 * 4 - m33 * 5;
+    v45 = m45 + m41 * 4 - m43 * 5;
+    v55 = m55 + m51 * 4 - m53 * 5;
+
+    // store 36 coefficients, each split so that value == high * 256 + low with low in [-128, 127]
+    int v_tm_offset = gz * psc(outcstep) + gy * psc(block_x) + gx;
+    int v_tm_step = psc(outcstep) * c; // coefficient k goes to v_tm_offset + k * v_tm_step in both output buffers
+
+    int v00_low = v00 & 255;
+    v00_low = v00_low >= 128 ? v00_low - 256 : v00_low; // reinterpret the low byte as a signed value
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 0 * v_tm_step, v00_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 0 * v_tm_step, (v00 - v00_low) >> 8); // v00 - v00_low is a multiple of 256, so the shift loses nothing
+    int v01_low = v01 & 255;
+    v01_low = v01_low >= 128 ? v01_low - 256 : v01_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 1 * v_tm_step, v01_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 1 * v_tm_step, (v01 - v01_low) >> 8);
+    int v02_low = v02 & 255;
+    v02_low = v02_low >= 128 ? v02_low - 256 : v02_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 2 * v_tm_step, v02_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 2 * v_tm_step, (v02 - v02_low) >> 8);
+    int v03_low = v03 & 255;
+    v03_low = v03_low >= 128 ? v03_low - 256 : v03_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 3 * v_tm_step, v03_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 3 * v_tm_step, (v03 - v03_low) >> 8);
+    int v04_low = v04 & 255;
+    v04_low = v04_low >= 128 ? v04_low - 256 : v04_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 4 * v_tm_step, v04_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 4 * v_tm_step, (v04 - v04_low) >> 8);
+    int v05_low = v05 & 255;
+    v05_low = v05_low >= 128 ? v05_low - 256 : v05_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 5 * v_tm_step, v05_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 5 * v_tm_step, (v05 - v05_low) >> 8);
+    int v10_low = v10 & 255;
+    v10_low = v10_low >= 128 ? v10_low - 256 : v10_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 6 * v_tm_step, v10_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 6 * v_tm_step, (v10 - v10_low) >> 8);
+    int v11_low = v11 & 255;
+    v11_low = v11_low >= 128 ? v11_low - 256 : v11_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 7 * v_tm_step, v11_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 7 * v_tm_step, (v11 - v11_low) >> 8);
+    int v12_low = v12 & 255;
+    v12_low = v12_low >= 128 ? v12_low - 256 : v12_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 8 * v_tm_step, v12_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 8 * v_tm_step, (v12 - v12_low) >> 8);
+    int v13_low = v13 & 255;
+    v13_low = v13_low >= 128 ? v13_low - 256 : v13_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 9 * v_tm_step, v13_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 9 * v_tm_step, (v13 - v13_low) >> 8);
+    int v14_low = v14 & 255;
+    v14_low = v14_low >= 128 ? v14_low - 256 : v14_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 10 * v_tm_step, v14_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 10 * v_tm_step, (v14 - v14_low) >> 8);
+    int v15_low = v15 & 255;
+    v15_low = v15_low >= 128 ? v15_low - 256 : v15_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 11 * v_tm_step, v15_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 11 * v_tm_step, (v15 - v15_low) >> 8);
+    int v20_low = v20 & 255;
+    v20_low = v20_low >= 128 ? v20_low - 256 : v20_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 12 * v_tm_step, v20_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 12 * v_tm_step, (v20 - v20_low) >> 8);
+    int v21_low = v21 & 255;
+    v21_low = v21_low >= 128 ? v21_low - 256 : v21_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 13 * v_tm_step, v21_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 13 * v_tm_step, (v21 - v21_low) >> 8);
+    int v22_low = v22 & 255;
+    v22_low = v22_low >= 128 ? v22_low - 256 : v22_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 14 * v_tm_step, v22_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 14 * v_tm_step, (v22 - v22_low) >> 8);
+    int v23_low = v23 & 255;
+    v23_low = v23_low >= 128 ? v23_low - 256 : v23_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 15 * v_tm_step, v23_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 15 * v_tm_step, (v23 - v23_low) >> 8);
+    int v24_low = v24 & 255;
+    v24_low = v24_low >= 128 ? v24_low - 256 : v24_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 16 * v_tm_step, v24_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 16 * v_tm_step, (v24 - v24_low) >> 8);
+    int v25_low = v25 & 255;
+    v25_low = v25_low >= 128 ? v25_low - 256 : v25_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 17 * v_tm_step, v25_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 17 * v_tm_step, (v25 - v25_low) >> 8);
+    int v30_low = v30 & 255;
+    v30_low = v30_low >= 128 ? v30_low - 256 : v30_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 18 * v_tm_step, v30_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 18 * v_tm_step, (v30 - v30_low) >> 8);
+    int v31_low = v31 & 255;
+    v31_low = v31_low >= 128 ? v31_low - 256 : v31_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 19 * v_tm_step, v31_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 19 * v_tm_step, (v31 - v31_low) >> 8);
+    int v32_low = v32 & 255;
+    v32_low = v32_low >= 128 ? v32_low - 256 : v32_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 20 * v_tm_step, v32_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 20 * v_tm_step, (v32 - v32_low) >> 8);
+    int v33_low = v33 & 255;
+    v33_low = v33_low >= 128 ? v33_low - 256 : v33_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 21 * v_tm_step, v33_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 21 * v_tm_step, (v33 - v33_low) >> 8);
+    int v34_low = v34 & 255;
+    v34_low = v34_low >= 128 ? v34_low - 256 : v34_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 22 * v_tm_step, v34_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 22 * v_tm_step, (v34 - v34_low) >> 8);
+    int v35_low = v35 & 255;
+    v35_low = v35_low >= 128 ? v35_low - 256 : v35_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 23 * v_tm_step, v35_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 23 * v_tm_step, (v35 - v35_low) >> 8);
+    int v40_low = v40 & 255;
+    v40_low = v40_low >= 128 ? v40_low - 256 : v40_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 24 * v_tm_step, v40_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 24 * v_tm_step, (v40 - v40_low) >> 8);
+    int v41_low = v41 & 255;
+    v41_low = v41_low >= 128 ? v41_low - 256 : v41_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 25 * v_tm_step, v41_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 25 * v_tm_step, (v41 - v41_low) >> 8);
+    int v42_low = v42 & 255;
+    v42_low = v42_low >= 128 ? v42_low - 256 : v42_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 26 * v_tm_step, v42_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 26 * v_tm_step, (v42 - v42_low) >> 8);
+    int v43_low = v43 & 255;
+    v43_low = v43_low >= 128 ? v43_low - 256 : v43_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 27 * v_tm_step, v43_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 27 * v_tm_step, (v43 - v43_low) >> 8);
+    int v44_low = v44 & 255;
+    v44_low = v44_low >= 128 ? v44_low - 256 : v44_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 28 * v_tm_step, v44_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 28 * v_tm_step, (v44 - v44_low) >> 8);
+    int v45_low = v45 & 255;
+    v45_low = v45_low >= 128 ? v45_low - 256 : v45_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 29 * v_tm_step, v45_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 29 * v_tm_step, (v45 - v45_low) >> 8);
+    int v50_low = v50 & 255;
+    v50_low = v50_low >= 128 ? v50_low - 256 : v50_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 30 * v_tm_step, v50_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 30 * v_tm_step, (v50 - v50_low) >> 8);
+    int v51_low = v51 & 255;
+    v51_low = v51_low >= 128 ? v51_low - 256 : v51_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 31 * v_tm_step, v51_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 31 * v_tm_step, (v51 - v51_low) >> 8);
+    int v52_low = v52 & 255;
+    v52_low = v52_low >= 128 ? v52_low - 256 : v52_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 32 * v_tm_step, v52_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 32 * v_tm_step, (v52 - v52_low) >> 8);
+    int v53_low = v53 & 255;
+    v53_low = v53_low >= 128 ? v53_low - 256 : v53_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 33 * v_tm_step, v53_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 33 * v_tm_step, (v53 - v53_low) >> 8);
+    int v54_low = v54 & 255;
+    v54_low = v54_low >= 128 ? v54_low - 256 : v54_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 34 * v_tm_step, v54_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 34 * v_tm_step, (v54 - v54_low) >> 8);
+    int v55_low = v55 & 255;
+    v55_low = v55_low >= 128 ? v55_low - 256 : v55_low;
+    i8buffer_st1(bottom_tm_low_data, v_tm_offset + 35 * v_tm_step, v55_low);
+    i8buffer_st1(bottom_tm_high_data, v_tm_offset + 35 * v_tm_step, (v55 - v55_low) >> 8);
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp
new file mode 100644
index 000000000000..184b4da67d67
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output_int8.comp
@@ -0,0 +1,303 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // 1: main() adds bias_data[gz] to every output
+layout(constant_id = 1) const int activation_type = 0; // passed through to activation_afp()
+layout(constant_id = 2) const float activation_param_0 = 0; // passed through to activation_afp()
+layout(constant_id = 3) const float activation_param_1 = 0; // passed through to activation_afp()
+layout(constant_id = 4) const int use_int8_requantize = 0; // 1: scale and store int8 to binding 5; otherwise store to binding 1
+layout(binding = 0) readonly buffer top_tm_blob { int top_tm_blob_data[]; }; // int32 input, 36 values read per tile
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // written when use_int8_requantize != 1
+layout(binding = 2) readonly buffer bias_blob { float bias_data[]; }; // indexed by gz, read only when bias_term == 1
+layout(binding = 3) readonly buffer weight_descales_blob { float weight_descales_data[]; }; // indexed by gz
+layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // only element 0 is read, and only when requantizing
+layout(binding = 5) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; // written when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter
+{
+    int cstep; // per-gz stride in top_tm_blob_data; cstep * outc separates the 36 value planes
+
+    int block_x; // tile count along x: bound for gx and row stride of the tile grid
+    int block_y; // tile count along y: bound for gy
+
+    int outw; // output row stride and right-edge bound for the guarded stores
+    int outh; // bottom-edge bound for the guarded stores
+    int outcstep; // per-gz stride in the output buffers
+    int outc; // bound for gz
+} p;
+
+void main() // one invocation: 36 int32 values of tile (gx, gy, gz) -> up to 4x4 descaled outputs
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.block_x || gy >= p.block_y || gz >= p.outc)
+        return;
+
+    // load the 36 values of this tile: vRC is read from plane R * 6 + C
+    int v_tm_offset = gz * p.cstep + gy * p.block_x + gx; // tile (gx, gy) inside the gz slice
+    int v_tm_step = p.cstep * p.outc; // distance between two consecutive planes
+
+    int v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step];
+    int v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step];
+    int v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step];
+    int v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step];
+    int v04 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step];
+    int v05 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step];
+    int v10 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step];
+    int v11 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step];
+    int v12 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step];
+    int v13 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step];
+    int v14 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step];
+    int v15 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step];
+    int v20 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step];
+    int v21 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step];
+    int v22 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step];
+    int v23 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step];
+    int v24 = top_tm_blob_data[v_tm_offset + 16 * v_tm_step];
+    int v25 = top_tm_blob_data[v_tm_offset + 17 * v_tm_step];
+    int v30 = top_tm_blob_data[v_tm_offset + 18 * v_tm_step];
+    int v31 = top_tm_blob_data[v_tm_offset + 19 * v_tm_step];
+    int v32 = top_tm_blob_data[v_tm_offset + 20 * v_tm_step];
+    int v33 = top_tm_blob_data[v_tm_offset + 21 * v_tm_step];
+    int v34 = top_tm_blob_data[v_tm_offset + 22 * v_tm_step];
+    int v35 = top_tm_blob_data[v_tm_offset + 23 * v_tm_step];
+    int v40 = top_tm_blob_data[v_tm_offset + 24 * v_tm_step];
+    int v41 = top_tm_blob_data[v_tm_offset + 25 * v_tm_step];
+    int v42 = top_tm_blob_data[v_tm_offset + 26 * v_tm_step];
+    int v43 = top_tm_blob_data[v_tm_offset + 27 * v_tm_step];
+    int v44 = top_tm_blob_data[v_tm_offset + 28 * v_tm_step];
+    int v45 = top_tm_blob_data[v_tm_offset + 29 * v_tm_step];
+    int v50 = top_tm_blob_data[v_tm_offset + 30 * v_tm_step];
+    int v51 = top_tm_blob_data[v_tm_offset + 31 * v_tm_step];
+    int v52 = top_tm_blob_data[v_tm_offset + 32 * v_tm_step];
+    int v53 = top_tm_blob_data[v_tm_offset + 33 * v_tm_step];
+    int v54 = top_tm_blob_data[v_tm_offset + 34 * v_tm_step];
+    int v55 = top_tm_blob_data[v_tm_offset + 35 * v_tm_step];
+
+    // implicit transpose: mIJ applies coefficient row I to vJ0..vJ5; the J == 5 results carry an extra factor of 4
+    int m00 = v00 + v01 + v02 + v03 + v04; // coefficient row 0: (1, 1, 1, 1, 1, 0)
+    int m01 = v10 + v11 + v12 + v13 + v14;
+    int m02 = v20 + v21 + v22 + v23 + v24;
+    int m03 = v30 + v31 + v32 + v33 + v34;
+    int m04 = v40 + v41 + v42 + v43 + v44;
+    int m05 = (v50 + v51 + v52 + v53 + v54) * 4;
+
+    int m10 = (v01 - v02) + (v03 - v04) * 2; // coefficient row 1: (0, 1, -1, 2, -2, 0)
+    int m11 = (v11 - v12) + (v13 - v14) * 2;
+    int m12 = (v21 - v22) + (v23 - v24) * 2;
+    int m13 = (v31 - v32) + (v33 - v34) * 2;
+    int m14 = (v41 - v42) + (v43 - v44) * 2;
+    int m15 = ((v51 - v52) + (v53 - v54) * 2) * 4;
+
+    int m20 = (v01 + v02) + (v03 + v04) * 4; // coefficient row 2: (0, 1, 1, 4, 4, 0)
+    int m21 = (v11 + v12) + (v13 + v14) * 4;
+    int m22 = (v21 + v22) + (v23 + v24) * 4;
+    int m23 = (v31 + v32) + (v33 + v34) * 4;
+    int m24 = (v41 + v42) + (v43 + v44) * 4;
+    int m25 = ((v51 + v52) + (v53 + v54) * 4) * 4;
+
+    int m30 = (v01 - v02) + (v03 - v04) * 8 + v05 * 4; // coefficient row 3: (0, 1, -1, 8, -8, 4)
+    int m31 = (v11 - v12) + (v13 - v14) * 8 + v15 * 4;
+    int m32 = (v21 - v22) + (v23 - v24) * 8 + v25 * 4;
+    int m33 = (v31 - v32) + (v33 - v34) * 8 + v35 * 4;
+    int m34 = (v41 - v42) + (v43 - v44) * 8 + v45 * 4;
+    int m35 = ((v51 - v52) + (v53 - v54) * 8 + v55 * 4) * 4;
+
+    v00 = m00 + m01 + m02 + m03 + m04; // second pass over mI0..mI5; the 4x4 result reuses the v names
+    v10 = m10 + m11 + m12 + m13 + m14;
+    v20 = m20 + m21 + m22 + m23 + m24;
+    v30 = m30 + m31 + m32 + m33 + m34;
+
+    v01 = (m01 - m02) + (m03 - m04) * 2;
+    v11 = (m11 - m12) + (m13 - m14) * 2;
+    v21 = (m21 - m22) + (m23 - m24) * 2;
+    v31 = (m31 - m32) + (m33 - m34) * 2;
+
+    v02 = (m01 + m02) + (m03 + m04) * 4;
+    v12 = (m11 + m12) + (m13 + m14) * 4;
+    v22 = (m21 + m22) + (m23 + m24) * 4;
+    v32 = (m31 + m32) + (m33 + m34) * 4;
+
+    v03 = (m01 - m02) + (m03 - m04) * 8 + m05; // mI5 is added unscaled here: the factor of 4 was applied in the first pass
+    v13 = (m11 - m12) + (m13 - m14) * 8 + m15;
+    v23 = (m21 - m22) + (m23 - m24) * 8 + m25;
+    v33 = (m31 - m32) + (m33 - m34) * 8 + m35;
+
+    const float descale = weight_descales_data[gz] * (1.f / 576.f); // per-gz descale times a fixed 1/576; NOTE(review): presumably undoes integer scaling of the winograd transforms - confirm
+
+    float out00 = float(v00) * descale; // int32 -> float, outRC taken from vRC
+    float out01 = float(v01) * descale;
+    float out02 = float(v02) * descale;
+    float out03 = float(v03) * descale;
+    float out10 = float(v10) * descale;
+    float out11 = float(v11) * descale;
+    float out12 = float(v12) * descale;
+    float out13 = float(v13) * descale;
+    float out20 = float(v20) * descale;
+    float out21 = float(v21) * descale;
+    float out22 = float(v22) * descale;
+    float out23 = float(v23) * descale;
+    float out30 = float(v30) * descale;
+    float out31 = float(v31) * descale;
+    float out32 = float(v32) * descale;
+    float out33 = float(v33) * descale;
+
+    if (bias_term == 1)
+    {
+        const float bias_value = bias_data[gz]; // one bias per gz, shared by all 16 outputs
+
+        out00 += bias_value;
+        out01 += bias_value;
+        out02 += bias_value;
+        out03 += bias_value;
+        out10 += bias_value;
+        out11 += bias_value;
+        out12 += bias_value;
+        out13 += bias_value;
+        out20 += bias_value;
+        out21 += bias_value;
+        out22 += bias_value;
+        out23 += bias_value;
+        out30 += bias_value;
+        out31 += bias_value;
+        out32 += bias_value;
+        out33 += bias_value;
+    }
+
+    out00 = float(activation_afp(afp(out00), activation_type, activation_param_0, activation_param_1)); // activation is evaluated in afp, result converted back to float
+    out01 = float(activation_afp(afp(out01), activation_type, activation_param_0, activation_param_1));
+    out02 = float(activation_afp(afp(out02), activation_type, activation_param_0, activation_param_1));
+    out03 = float(activation_afp(afp(out03), activation_type, activation_param_0, activation_param_1));
+    out10 = float(activation_afp(afp(out10), activation_type, activation_param_0, activation_param_1));
+    out11 = float(activation_afp(afp(out11), activation_type, activation_param_0, activation_param_1));
+    out12 = float(activation_afp(afp(out12), activation_type, activation_param_0, activation_param_1));
+    out13 = float(activation_afp(afp(out13), activation_type, activation_param_0, activation_param_1));
+    out20 = float(activation_afp(afp(out20), activation_type, activation_param_0, activation_param_1));
+    out21 = float(activation_afp(afp(out21), activation_type, activation_param_0, activation_param_1));
+    out22 = float(activation_afp(afp(out22), activation_type, activation_param_0, activation_param_1));
+    out23 = float(activation_afp(afp(out23), activation_type, activation_param_0, activation_param_1));
+    out30 = float(activation_afp(afp(out30), activation_type, activation_param_0, activation_param_1));
+    out31 = float(activation_afp(afp(out31), activation_type, activation_param_0, activation_param_1));
+    out32 = float(activation_afp(afp(out32), activation_type, activation_param_0, activation_param_1));
+    out33 = float(activation_afp(afp(out33), activation_type, activation_param_0, activation_param_1));
+
+    // store up to 4x4 outputs; (x, y) is the top-left output coordinate covered by this tile
+    int x = gx * 4;
+    int y = gy * 4;
+
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = buffer_ld1(top_scales_data, 0); // a single scale (element 0) is applied to all 16 outputs
+
+        out00 *= top_scale;
+        out01 *= top_scale;
+        out02 *= top_scale;
+        out03 *= top_scale;
+        out10 *= top_scale;
+        out11 *= top_scale;
+        out12 *= top_scale;
+        out13 *= top_scale;
+        out20 *= top_scale;
+        out21 *= top_scale;
+        out22 *= top_scale;
+        out23 *= top_scale;
+        out30 *= top_scale;
+        out31 *= top_scale;
+        out32 *= top_scale;
+        out33 *= top_scale;
+
+        int out00_int8 = float2int8(out00); // float2int8 is a helper that is not defined in this file
+        int out01_int8 = float2int8(out01);
+        int out02_int8 = float2int8(out02);
+        int out03_int8 = float2int8(out03);
+        int out10_int8 = float2int8(out10);
+        int out11_int8 = float2int8(out11);
+        int out12_int8 = float2int8(out12);
+        int out13_int8 = float2int8(out13);
+        int out20_int8 = float2int8(out20);
+        int out21_int8 = float2int8(out21);
+        int out22_int8 = float2int8(out22);
+        int out23_int8 = float2int8(out23);
+        int out30_int8 = float2int8(out30);
+        int out31_int8 = float2int8(out31);
+        int out32_int8 = float2int8(out32);
+        int out33_int8 = float2int8(out33);
+
+        int v_offset0 = gz * p.outcstep + y * p.outw + x; // element (x, y) of the gz slice
+        int v_offset1 = v_offset0 + p.outw; // following rows are p.outw elements apart
+        int v_offset2 = v_offset1 + p.outw;
+        int v_offset3 = v_offset2 + p.outw;
+
+        i8buffer_st1(top_blob_int8_data, v_offset0 + 0, out00_int8); // element (x, y) itself is stored without a bounds test
+        if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 1, out01_int8); // columns past outw are skipped
+        if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 2, out02_int8);
+        if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset0 + 3, out03_int8);
+
+        if (y + 1 < p.outh) // rows past outh are skipped
+        {
+            i8buffer_st1(top_blob_int8_data, v_offset1 + 0, out10_int8);
+            if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 1, out11_int8);
+            if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 2, out12_int8);
+            if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset1 + 3, out13_int8);
+        }
+
+        if (y + 2 < p.outh)
+        {
+            i8buffer_st1(top_blob_int8_data, v_offset2 + 0, out20_int8);
+            if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 1, out21_int8);
+            if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 2, out22_int8);
+            if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset2 + 3, out23_int8);
+        }
+
+        if (y + 3 < p.outh)
+        {
+            i8buffer_st1(top_blob_int8_data, v_offset3 + 0, out30_int8);
+            if (x + 1 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 1, out31_int8);
+            if (x + 2 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 2, out32_int8);
+            if (x + 3 < p.outw) i8buffer_st1(top_blob_int8_data, v_offset3 + 3, out33_int8);
+        }
+    }
+    else
+    {
+        int v_offset0 = gz * p.outcstep + y * p.outw + x; // same addressing as the int8 branch, into top_blob_data
+        int v_offset1 = v_offset0 + p.outw;
+        int v_offset2 = v_offset1 + p.outw;
+        int v_offset3 = v_offset2 + p.outw;
+
+        buffer_st1(top_blob_data, v_offset0 + 0, afp(out00)); // element (x, y) itself is stored without a bounds test
+        if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset0 + 1, afp(out01)); // columns past outw are skipped
+        if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset0 + 2, afp(out02));
+        if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset0 + 3, afp(out03));
+
+        if (y + 1 < p.outh) // rows past outh are skipped
+        {
+            buffer_st1(top_blob_data, v_offset1 + 0, afp(out10));
+            if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset1 + 1, afp(out11));
+            if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset1 + 2, afp(out12));
+            if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset1 + 3, afp(out13));
+        }
+
+        if (y + 2 < p.outh)
+        {
+            buffer_st1(top_blob_data, v_offset2 + 0, afp(out20));
+            if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset2 + 1, afp(out21));
+            if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset2 + 2, afp(out22));
+            if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset2 + 3, afp(out23));
+        }
+
+        if (y + 3 < p.outh)
+        {
+            buffer_st1(top_blob_data, v_offset3 + 0, afp(out30));
+            if (x + 1 < p.outw) buffer_st1(top_blob_data, v_offset3 + 1, afp(out31));
+            if (x + 2 < p.outw) buffer_st1(top_blob_data, v_offset3 + 2, afp(out32));
+            if (x + 3 < p.outw) buffer_st1(top_blob_data, v_offset3 + 3, afp(out33));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp
new file mode 100644
index 000000000000..e083c947a2ae
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8.comp
@@ -0,0 +1,389 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_integerDotProduct16BitSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#define LOCAL_MEMORY_UNROLL_INCH  8
+#define LOCAL_MEMORY_UNROLL_INCH4 (LOCAL_MEMORY_UNROLL_INCH / 4)
+
+layout(constant_id = 0) const int batch = 1; // bound for gz
+layout(constant_id = 1) const int c = 0; // input channel count; main() reduces over it in groups of 4
+layout(constant_id = 2) const int outc = 0; // output channel count; each invocation covers channels gy * 4 .. gy * 4 + 3
+layout(constant_id = 3) const int elempack = 1; // 4: read bottom_tm_blob_data_4; otherwise gather from bottom_tm_blob_data
+layout(constant_id = 4) const int out_elempack = 1; // 4: write top_tm_blob_data_4; otherwise write top_tm_blob_data
+
+#define shape_constant_id_offset 5
+layout(constant_id = shape_constant_id_offset + 0) const int cstep = 0; // read through psc(); NOTE(review): presumably falls back to p.cstep when left at 0 - confirm against the psc macro
+layout(constant_id = shape_constant_id_offset + 1) const int outw = 0; // read through psc(); bound for the position index gx
+layout(constant_id = shape_constant_id_offset + 2) const int outcstep = 0; // read through psc(); per-channel stride of the output
+
+layout(binding = 0) readonly buffer bottom_tm_blob { sint16 bottom_tm_blob_data[]; }; // scalar int16 input, used when elempack != 4
+layout(binding = 1) readonly buffer bottom_tm_blob_4 { sint16vec4 bottom_tm_blob_data_4[]; }; // 4-packed int16 input, used when elempack == 4
+layout(binding = 2) writeonly buffer top_tm_blob { int top_tm_blob_data[]; }; // scalar int32 output, used when out_elempack != 4
+layout(binding = 3) writeonly buffer top_tm_blob_4 { ivec4 top_tm_blob_data_4[]; }; // 4-packed int32 output, used when out_elempack == 4
+layout(binding = 4) readonly buffer weight_tm_blob { sint16vec4 weight_tm_data[]; }; // int16 weights, indexed as [gz][outc_aligned][c4] by main()
+
+layout(push_constant) uniform parameter
+{
+    int cstep; // same names as the shape specialization constants above
+    int outw;
+    int outcstep;
+} p;
+
+#if NCNN_shader_local_memory
+shared lint16vec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH4][4]; // staged inputs: [local x][channel group in step][position 0..3]
+shared lint16vec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH4][4]; // staged weights: [local y][channel group in step][output channel 0..3]
+#endif
+
+void main() // each invocation accumulates a 4 (positions) x 4 (output channels) int32 tile of slice gz
+{
+    int gx = int(gl_GlobalInvocationID.x) * 4; // first of 4 consecutive positions
+    int gy = int(gl_GlobalInvocationID.y); // group of 4 output channels starting at gy * 4
+    int gz = int(gl_GlobalInvocationID.z); // slice index, bounded by batch
+
+    const int c4 = (c + 3) / 4; // number of 4-channel input groups
+    const int outc4 = (outc + 3) / 4;
+    const int outc_aligned = outc4 * 4; // outc rounded up to a multiple of 4; row count used for weight addressing
+
+#if !NCNN_shader_local_memory
+    if (gx >= psc(outw) || gy * 4 >= outc || gz >= batch) // without shared memory, out-of-range invocations exit right away
+        return;
+#endif
+
+    ivec4 sum0 = ivec4(0); // sumN.{r,g,b,a}: position gx + N against output channels gy * 4 + {0,1,2,3}
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+#if NCNN_shader_local_memory
+    const int lx = int(gl_LocalInvocationID.x);
+    const int ly = int(gl_LocalInvocationID.y);
+
+    int q4 = 0;
+    for (; q4 + (LOCAL_MEMORY_UNROLL_INCH4 - 1) < c4; q4 += LOCAL_MEMORY_UNROLL_INCH4) // full steps of LOCAL_MEMORY_UNROLL_INCH4 channel groups
+    {
+        if (ly < 4) // invocations with ly < 4 stage position gx + ly into tmp_v[lx]
+        {
+            const int pos = gx + ly;
+            if (pos < psc(outw))
+            {
+                for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++)
+                {
+                    if (elempack == 4)
+                    {
+                        tmp_v[lx][z4][ly] = i16buffer_sm4(bottom_tm_blob_data_4, gz * c4 * psc(cstep) + (q4 + z4) * psc(cstep) + pos); // packed input: one 4-channel element per position
+                    }
+                    else
+                    {
+                        const ivec4 q = min((q4 + z4) * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); // channel indices past c - 1 are clamped to the last channel
+                        const int v_offset = gz * c * psc(cstep) + pos;
+                        i16buffer_st4(tmp_v[lx][z4], ly, ivec4(i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep)))); // gather 4 scalar channels into one staged element
+                    }
+                }
+            }
+            else
+            {
+                for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++)
+                {
+                    i16buffer_st4(tmp_v[lx][z4], ly, ivec4(0)); // positions at or past outw are staged as zeros
+                }
+            }
+        }
+
+        if (lx < 4) // invocations with lx < 4 stage the weights of output channel gy * 4 + lx into tmp_k[ly]
+        {
+            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++)
+            {
+                if (gy * 4 + lx < outc_aligned)
+                    tmp_k[ly][z4][lx] = i16buffer_sm4(weight_tm_data, gz * outc_aligned * c4 + (gy * 4 + lx) * c4 + q4 + z4);
+                else
+                    i16buffer_st4(tmp_k[ly][z4], lx, ivec4(0)); // rows at or past outc_aligned are staged as zeros
+            }
+        }
+
+        barrier(); // staging must be complete before tmp_v / tmp_k are read below
+
+        for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH4; z4++)
+        {
+            const aint16vec4 v0 = lint162aint16vec4(tmp_v[lx][z4][0]); // 4 positions of this invocation's lx
+            const aint16vec4 v1 = lint162aint16vec4(tmp_v[lx][z4][1]);
+            const aint16vec4 v2 = lint162aint16vec4(tmp_v[lx][z4][2]);
+            const aint16vec4 v3 = lint162aint16vec4(tmp_v[lx][z4][3]);
+
+            const aint16vec4 k0 = lint162aint16vec4(tmp_k[ly][z4][0]); // 4 output channels of this invocation's ly
+            const aint16vec4 k1 = lint162aint16vec4(tmp_k[ly][z4][1]);
+            const aint16vec4 k2 = lint162aint16vec4(tmp_k[ly][z4][2]);
+            const aint16vec4 k3 = lint162aint16vec4(tmp_k[ly][z4][3]);
+
+#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated
+            sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); // dotEXT path, same condition as the GL_EXT_integer_dot_product requirement at the top of the file
+            sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3));
+            sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3));
+            sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3));
+#else
+            const ivec4 v0i = ivec4(v0); // fallback: convert to ivec4 and expand the 4-element dot products by hand
+            const ivec4 v1i = ivec4(v1);
+            const ivec4 v2i = ivec4(v2);
+            const ivec4 v3i = ivec4(v3);
+            const ivec4 k0i = ivec4(k0);
+            const ivec4 k1i = ivec4(k1);
+            const ivec4 k2i = ivec4(k2);
+            const ivec4 k3i = ivec4(k3);
+
+            sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a;
+            sum0.g += v0i.r * k1i.r + v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a;
+            sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * k2i.a;
+            sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a;
+
+            sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a;
+            sum1.g += v1i.r * k1i.r + v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a;
+            sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a;
+            sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a;
+
+            sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a;
+            sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a;
+            sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a;
+            sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a;
+
+            sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a;
+            sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a;
+            sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a;
+            sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a;
+#endif
+        }
+
+        barrier(); // all reads finish before the next iteration overwrites tmp_v / tmp_k
+    }
+
+    if (q4 < c4) // leftover channel groups, fewer than LOCAL_MEMORY_UNROLL_INCH4
+    {
+        const int remain = c4 - q4;
+
+        if (ly < 4) // same staging as in the loop above, limited to `remain` channel groups
+        {
+            const int pos = gx + ly;
+            if (pos < psc(outw))
+            {
+                for (int z4 = 0; z4 < remain; z4++)
+                {
+                    if (elempack == 4)
+                    {
+                        tmp_v[lx][z4][ly] = i16buffer_sm4(bottom_tm_blob_data_4, gz * c4 * psc(cstep) + (q4 + z4) * psc(cstep) + pos);
+                    }
+                    else
+                    {
+                        const ivec4 q = min((q4 + z4) * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); // channel indices past c - 1 are clamped to the last channel
+                        const int v_offset = gz * c * psc(cstep) + pos;
+                        i16buffer_st4(tmp_v[lx][z4], ly, ivec4(i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep)), i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep))));
+                    }
+                }
+            }
+            else
+            {
+                for (int z4 = 0; z4 < remain; z4++)
+                {
+                    i16buffer_st4(tmp_v[lx][z4], ly, ivec4(0)); // positions at or past outw are staged as zeros
+                }
+            }
+        }
+
+        if (lx < 4)
+        {
+            for (int z4 = 0; z4 < remain; z4++)
+            {
+                if (gy * 4 + lx < outc_aligned)
+                    tmp_k[ly][z4][lx] = i16buffer_sm4(weight_tm_data, gz * outc_aligned * c4 + (gy * 4 + lx) * c4 + q4 + z4);
+                else
+                    i16buffer_st4(tmp_k[ly][z4], lx, ivec4(0));
+            }
+        }
+
+        barrier(); // staging must be complete before the reads below; no trailing barrier since nothing is staged afterwards
+
+        for (int z4 = 0; z4 < remain; z4++)
+        {
+            const aint16vec4 v0 = lint162aint16vec4(tmp_v[lx][z4][0]);
+            const aint16vec4 v1 = lint162aint16vec4(tmp_v[lx][z4][1]);
+            const aint16vec4 v2 = lint162aint16vec4(tmp_v[lx][z4][2]);
+            const aint16vec4 v3 = lint162aint16vec4(tmp_v[lx][z4][3]);
+
+            const aint16vec4 k0 = lint162aint16vec4(tmp_k[ly][z4][0]);
+            const aint16vec4 k1 = lint162aint16vec4(tmp_k[ly][z4][1]);
+            const aint16vec4 k2 = lint162aint16vec4(tmp_k[ly][z4][2]);
+            const aint16vec4 k3 = lint162aint16vec4(tmp_k[ly][z4][3]);
+
+#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated
+            sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); // dotEXT path
+            sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3));
+            sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3));
+            sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3));
+#else
+            const ivec4 v0i = ivec4(v0); // fallback: convert to ivec4 and expand the dot products by hand
+            const ivec4 v1i = ivec4(v1);
+            const ivec4 v2i = ivec4(v2);
+            const ivec4 v3i = ivec4(v3);
+            const ivec4 k0i = ivec4(k0);
+            const ivec4 k1i = ivec4(k1);
+            const ivec4 k2i = ivec4(k2);
+            const ivec4 k3i = ivec4(k3);
+
+            sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a;
+            sum0.g += v0i.r * k1i.r + v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a;
+            sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * k2i.a;
+            sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a;
+
+            sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a;
+            sum1.g += v1i.r * k1i.r + v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a;
+            sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a;
+            sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a;
+
+            sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a;
+            sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a;
+            sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a;
+            sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a;
+
+            sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a;
+            sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a;
+            sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a;
+            sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a;
+#endif
+        }
+    }
+#else
+    for (int q4 = 0; q4 < c4; q4++) // no shared memory: each invocation loads its own inputs and weights per channel group
+    {
+        aint16vec4 v0; // vN: 4 input channels at position gx + N
+        aint16vec4 v1;
+        aint16vec4 v2;
+        aint16vec4 v3;
+
+        if (elempack == 4)
+        {
+            const int v_offset = gz * c4 * psc(cstep) + q4 * psc(cstep) + gx;
+            v0 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 0); // positions gx + 1 .. gx + 3 are loaded without an outw test; only the stores are guarded
+            v1 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 1);
+            v2 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 2);
+            v3 = i16buffer_ld4(bottom_tm_blob_data_4, v_offset + 3);
+        }
+        else
+        {
+            const ivec4 q = min(q4 * 4 + ivec4(0, 1, 2, 3), ivec4(c - 1)); // channel indices past c - 1 are clamped to the last channel
+            const int v_offset = gz * c * psc(cstep) + gx;
+            v0 = aint16vec4(ivec4(
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 0),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 0),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 0),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 0)));
+            v1 = aint16vec4(ivec4(
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 1),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 1),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 1),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 1)));
+            v2 = aint16vec4(ivec4(
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 2),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 2),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 2),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 2)));
+            v3 = aint16vec4(ivec4(
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.r * psc(cstep) + 3),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.g * psc(cstep) + 3),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.b * psc(cstep) + 3),
+                i16buffer_ld1(bottom_tm_blob_data, v_offset + q.a * psc(cstep) + 3)));
+        }
+
+        const int w_offset = gz * outc_aligned * c4 + gy * 4 * c4 + q4; // weights addressed as [gz][output channel][channel group]
+        const aint16vec4 k0 = i16buffer_ld4(weight_tm_data, w_offset + 0 * c4); // kN: weights of output channel gy * 4 + N
+        const aint16vec4 k1 = i16buffer_ld4(weight_tm_data, w_offset + 1 * c4);
+        const aint16vec4 k2 = i16buffer_ld4(weight_tm_data, w_offset + 2 * c4);
+        const aint16vec4 k3 = i16buffer_ld4(weight_tm_data, w_offset + 3 * c4);
+
+#if (NCNN_int16_storage || NCNN_int16_packed) && ncnn_shaderInt16 && ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct16BitSignedAccelerated
+        sum0 += ivec4(dotEXT(v0, k0), dotEXT(v0, k1), dotEXT(v0, k2), dotEXT(v0, k3)); // dotEXT path
+        sum1 += ivec4(dotEXT(v1, k0), dotEXT(v1, k1), dotEXT(v1, k2), dotEXT(v1, k3));
+        sum2 += ivec4(dotEXT(v2, k0), dotEXT(v2, k1), dotEXT(v2, k2), dotEXT(v2, k3));
+        sum3 += ivec4(dotEXT(v3, k0), dotEXT(v3, k1), dotEXT(v3, k2), dotEXT(v3, k3));
+#else
+        const ivec4 v0i = ivec4(v0); // fallback: convert to ivec4 and expand the dot products by hand
+        const ivec4 v1i = ivec4(v1);
+        const ivec4 v2i = ivec4(v2);
+        const ivec4 v3i = ivec4(v3);
+        const ivec4 k0i = ivec4(k0);
+        const ivec4 k1i = ivec4(k1);
+        const ivec4 k2i = ivec4(k2);
+        const ivec4 k3i = ivec4(k3);
+
+        sum0.r += v0i.r * k0i.r + v0i.g * k0i.g + v0i.b * k0i.b + v0i.a * k0i.a;
+        sum0.g += v0i.r * k1i.r + v0i.g * k1i.g + v0i.b * k1i.b + v0i.a * k1i.a;
+        sum0.b += v0i.r * k2i.r + v0i.g * k2i.g + v0i.b * k2i.b + v0i.a * k2i.a;
+        sum0.a += v0i.r * k3i.r + v0i.g * k3i.g + v0i.b * k3i.b + v0i.a * k3i.a;
+
+        sum1.r += v1i.r * k0i.r + v1i.g * k0i.g + v1i.b * k0i.b + v1i.a * k0i.a;
+        sum1.g += v1i.r * k1i.r + v1i.g * k1i.g + v1i.b * k1i.b + v1i.a * k1i.a;
+        sum1.b += v1i.r * k2i.r + v1i.g * k2i.g + v1i.b * k2i.b + v1i.a * k2i.a;
+        sum1.a += v1i.r * k3i.r + v1i.g * k3i.g + v1i.b * k3i.b + v1i.a * k3i.a;
+
+        sum2.r += v2i.r * k0i.r + v2i.g * k0i.g + v2i.b * k0i.b + v2i.a * k0i.a;
+        sum2.g += v2i.r * k1i.r + v2i.g * k1i.g + v2i.b * k1i.b + v2i.a * k1i.a;
+        sum2.b += v2i.r * k2i.r + v2i.g * k2i.g + v2i.b * k2i.b + v2i.a * k2i.a;
+        sum2.a += v2i.r * k3i.r + v2i.g * k3i.g + v2i.b * k3i.b + v2i.a * k3i.a;
+
+        sum3.r += v3i.r * k0i.r + v3i.g * k0i.g + v3i.b * k0i.b + v3i.a * k0i.a;
+        sum3.g += v3i.r * k1i.r + v3i.g * k1i.g + v3i.b * k1i.b + v3i.a * k1i.a;
+        sum3.b += v3i.r * k2i.r + v3i.g * k2i.g + v3i.b * k2i.b + v3i.a * k2i.a;
+        sum3.a += v3i.r * k3i.r + v3i.g * k3i.g + v3i.b * k3i.b + v3i.a * k3i.a;
+#endif
+    }
+#endif
+
+#if NCNN_shader_local_memory
+    if (gx >= psc(outw) || gy * 4 >= outc || gz >= batch) // with shared memory the range test is deferred until after the barrier() sections above
+        return;
+#endif
+
+    if (out_elempack == 4)
+    {
+        int gi = (gz * outc4 + gy) * psc(outcstep) + gx; // 4-packed output: one ivec4 (4 output channels) per position
+
+        top_tm_blob_data_4[gi + 0] = sum0; // position gx itself already passed the range test
+        if (gx + 1 < psc(outw)) top_tm_blob_data_4[gi + 1] = sum1; // positions past outw are skipped
+        if (gx + 2 < psc(outw)) top_tm_blob_data_4[gi + 2] = sum2;
+        if (gx + 3 < psc(outw)) top_tm_blob_data_4[gi + 3] = sum3;
+    }
+    else
+    {
+        int gi = (gz * outc + gy * 4) * psc(outcstep) + gx; // scalar output: channel planes are psc(outcstep) apart
+
+        top_tm_blob_data[gi + 0 * psc(outcstep) + 0] = sum0.r; // channel gy * 4 and position gx already passed the range test
+        if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 0] = sum0.g; // channels past outc are skipped
+        if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 0] = sum0.b;
+        if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 0] = sum0.a;
+
+        if (gx + 1 < psc(outw)) // positions past outw are skipped
+        {
+            top_tm_blob_data[gi + 0 * psc(outcstep) + 1] = sum1.r;
+            if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 1] = sum1.g;
+            if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 1] = sum1.b;
+            if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 1] = sum1.a;
+        }
+
+        if (gx + 2 < psc(outw))
+        {
+            top_tm_blob_data[gi + 0 * psc(outcstep) + 2] = sum2.r;
+            if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 2] = sum2.g;
+            if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 2] = sum2.b;
+            if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 2] = sum2.a;
+        }
+
+        if (gx + 3 < psc(outw))
+        {
+            top_tm_blob_data[gi + 0 * psc(outcstep) + 3] = sum3.r;
+            if (gy * 4 + 1 < outc) top_tm_blob_data[gi + 1 * psc(outcstep) + 3] = sum3.g;
+            if (gy * 4 + 2 < outc) top_tm_blob_data[gi + 2 * psc(outcstep) + 3] = sum3.b;
+            if (gy * 4 + 3 < outc) top_tm_blob_data[gi + 3 * psc(outcstep) + 3] = sum3.a;
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp
new file mode 100644
index 000000000000..cbef1cf91451
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm_int8_cm.comp
@@ -0,0 +1,731 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_basic : require
+
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#if ncnn_VK_KHR_cooperative_matrix
+#extension GL_KHR_cooperative_matrix : require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix : require
+#extension GL_NV_integer_cooperative_matrix : require
+#endif
+
+layout(constant_id = 0) const uint batch = 1; // invocations with gl_GlobalInvocationID.z >= batch exit immediately
+layout(constant_id = 1) const uint M = 1; // cooperative matrix shape: A is M x K, B is K x N, accumulator is M x N
+layout(constant_id = 2) const uint N = 1;
+layout(constant_id = 3) const uint K = 1;
+layout(constant_id = 4) const uint UNROLL_SG_M = 2; // accumulator tiles per subgroup along M
+layout(constant_id = 5) const uint UNROLL_SG_N = 2; // accumulator tiles per subgroup along N
+layout(constant_id = 6) const uint UNROLL_SG_K = 2; // K steps staged in shared memory between two barriers
+layout(constant_id = 7) const uint UNROLL_WG_M = 2; // subgroup slots per workgroup along M (first index of tmp_v0/tmp_v1)
+layout(constant_id = 8) const uint UNROLL_WG_N = 2; // subgroup slots per workgroup along N (first index of tmp_k0/tmp_k1)
+layout(constant_id = 9) const uint subgroup_size = 32; // stride of the cooperative copy loops; NOTE(review): assumed to equal the real subgroup size - confirm on the host side
+layout(constant_id = 10) const uint inch = 1; // reduction extent; main() walks it in ceil(inch / K) steps
+layout(constant_id = 11) const uint outch = 1; // extent of the N axis; bounds the output stores
+layout(constant_id = 12) const uint elempack = 1; // 1 selects the split low/high inputs, otherwise the ivec2 low_high input
+layout(constant_id = 13) const uint out_elempack = 1; // 1 selects the int output buffer, otherwise the ivec4 output buffer
+layout(constant_id = 14) const uint wbstep = 0; // stride of weight_tm_data per gl_GlobalInvocationID.z
+
+#define shape_constant_id_offset 15
+layout(constant_id = shape_constant_id_offset + 0) const uint size = 0; // read through psc(); NOTE(review): psc is defined outside this file, presumably falling back to p.size when 0 - confirm
+layout(constant_id = shape_constant_id_offset + 1) const uint cstep = 0; // input stride, divided by 4 (elempack == 1) or 2 (otherwise) before use
+layout(constant_id = shape_constant_id_offset + 2) const uint outcstep = 0; // output stride per channel (out_elempack == 1) or per group of 4 channels
+
+layout(binding = 0) readonly buffer bottom_tm_low_blob { sint8vec4 bottom_tm_low_data[]; }; // low input part, read when elempack == 1
+layout(binding = 1) readonly buffer bottom_tm_high_blob { sint8vec4 bottom_tm_high_data[]; }; // high input part, read when elempack == 1
+layout(binding = 2) readonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; }; // read when elempack != 1; .x feeds tmp_v0 and .y feeds tmp_v1
+layout(binding = 3) writeonly buffer top_tm_blob { int top_tm_blob_data[]; }; // output written when out_elempack == 1
+layout(binding = 4) writeonly buffer top_tm_blob_4 { ivec4 top_tm_blob_data_4[]; }; // output written when out_elempack != 1
+layout(binding = 5) readonly buffer weight_tm_blob { ivec2 weight_tm_data[]; }; // weights; .x feeds tmp_k0 and .y feeds tmp_k1
+
+layout(push_constant) uniform parameter // same member names as the size/cstep/outcstep specialization constants
+{
+    uint size;
+    uint cstep;
+    uint outcstep;
+} p;
+
+const uint Md4 = M / 4; // extents in ints; NOTE(review): each int appears to carry four int8 values, since the int tiles are loaded into int8 cooperative matrices - confirm
+const uint Nd4 = N / 4;
+const uint Kd4 = K / 4;
+
+#if ncnn_VK_KHR_cooperative_matrix
+#define PAD 1
+#elif ncnn_VK_NV_cooperative_matrix
+#define PAD 0
+#endif
+
+const uint Kd4p = Kd4 + PAD; // row stride of the elempack != 1 input tile; NOTE(review): PAD is presumably there to avoid shared-memory bank conflicts - confirm
+const uint Nd4p = Nd4 + PAD; // row stride of the weight tile
+
+shared int tmp_v0[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; // low input tiles, one slot per sgmi
+shared int tmp_v1[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; // high input tiles, one slot per sgmi
+shared int tmp_k0[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; // weight .x tiles, one slot per sgni
+shared int tmp_k1[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; // weight .y tiles, one slot per sgni
+shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; // output staging, one slot per subgroup (indexed by gl_SubgroupID)
+
+void main() // batched int8 GEMM on cooperative matrices: (low, high) input tiles x (low, high) weight tiles -> int32 sums
+{
+    const uint gz = gl_GlobalInvocationID.z; // batch index; scales the input, weight and output offsets below
+    if (gz >= batch)
+        return;
+
+    const uint wgi = gl_WorkGroupID.x;
+    const uint sgi = gl_SubgroupID;
+
+    const uint wgmm = (psc(size) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M); // workgroup tiles along the size (M) axis, rounded up
+    const uint wgnn = (outch + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N); // workgroup tiles along the outch (N) axis, rounded up
+
+    const uint wgmi = wgi / wgnn; // flat workgroup id -> (M tile, N tile); N varies fastest
+    const uint wgni = wgi % wgnn;
+
+    const uint sgmi = sgi / UNROLL_WG_N; // subgroup id -> (M slot, N slot) inside the workgroup
+    const uint sgni = sgi % UNROLL_WG_N;
+
+    if (wgmi >= wgmm) // wgmi is derived from gl_WorkGroupID.x, not from per-invocation ids
+        return;
+
+    const uint kk = (inch + K - 1) / K; // number of K-sized steps over inch, rounded up
+    const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K; // kk rounded up to a multiple of UNROLL_SG_K; only used in weight offsets
+    const uint si = gl_SubgroupInvocationID;
+
+    const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M; // first M-tile index owned by this subgroup
+    const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N; // first N-tile index owned by this subgroup
+
+#if ncnn_VK_KHR_cooperative_matrix
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum[UNROLL_SG_N][UNROLL_SG_M]; // A0 * B0 (low x low)
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum8[UNROLL_SG_N][UNROLL_SG_M]; // A1 * B0 + A0 * B1 (cross terms)
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum16[UNROLL_SG_N][UNROLL_SG_M]; // A1 * B1 (high x high)
+#elif ncnn_VK_NV_cooperative_matrix
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M]; // A0 * B0 (low x low)
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum8[UNROLL_SG_N][UNROLL_SG_M]; // A1 * B0 + A0 * B1 (cross terms)
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum16[UNROLL_SG_N][UNROLL_SG_M]; // A1 * B1 (high x high)
+#endif
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+#if ncnn_VK_KHR_cooperative_matrix
+            sum[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+            sum8[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+            sum16[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+#elif ncnn_VK_NV_cooperative_matrix
+            sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+            sum8[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+            sum16[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+#endif
+        }
+    }
+
+    if (kk >= UNROLL_SG_K * 2) // at least two batches of UNROLL_SG_K steps exist
+    {
+        // ping-pong: the next tile is fetched into function-local arrays while the current tile in shared memory is multiplied
+
+        const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M; // ints per zk slice of the input tile when elempack == 1
+        const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+        const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M; // ints per zk slice of the input tile when elempack != 1
+        const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+        const uint V_USGM_USGK = elempack == 1 ? Md4_K_USGM_USGK : M_Kd4p_USGM_USGK; // ints of tmp_v0/tmp_v1[sgmi] actually filled
+        const uint V_USGM_USGK_d_subgroupsize = (V_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N); // loads per invocation; the tile is split over the UNROLL_WG_N subgroups sharing sgmi
+        const uint K_Nd4p = K * Nd4p;
+        const uint K_Nd4p_USGN = K_Nd4p * UNROLL_SG_N;
+        const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K; // ints of tmp_k0/tmp_k1[sgni] filled per batch
+        const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M); // loads per invocation; the tile is split over the UNROLL_WG_M subgroups sharing sgni
+
+        ivec2 prefetch_tmp_v[(M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)]; // x = low part, y = high part; sized from the padded (elempack != 1) tile
+        ivec2 prefetch_tmp_k[(K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)]; // x feeds tmp_k0, y feeds tmp_k1
+
+        // prefetch the very first tile (K step 0) before entering the loop
+        {
+            const uint ki = 0;
+
+            if (elempack == 1)
+            {
+                const uint cstepd4 = psc(cstep) / 4;
+
+                [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; // flat index inside the tile handled by this invocation
+
+                    if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK) // bounds check only matters when the tile size is not a multiple of the copy stride
+                    {
+                        const uint zk = siq / Md4_K_USGM; // siq -> (zk, zm, i, j) with j varying fastest
+                        const uint zmij = siq % Md4_K_USGM;
+                        const uint zm = zmij / (Md4 * K);
+                        const uint ij = zmij % (Md4 * K);
+                        const uint i = ij / Md4;
+                        const uint j = ij % Md4;
+
+                        const uint gk = (ki + zk) * K + i; // index along inch
+                        const uint gm = (mi + zm) * Md4 + j; // index along size, in units of 4
+
+                        const uint gi = (gz * inch + gk) * cstepd4 + gm;
+                        prefetch_tmp_v[q] = ivec2(i8buffer_sm4(bottom_tm_low_data, gi), i8buffer_sm4(bottom_tm_high_data, gi)); // NOTE(review): i8buffer_sm4 is defined outside this file; presumably returns 4 packed int8 as one int - confirm
+                    }
+                }
+            }
+            else // elempack != 1 (the original note says elempack == 4)
+            {
+                const uint inchd4 = inch / 4;
+                const uint cstepd2 = psc(cstep) / 2;
+
+                [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                    {
+                        const uint zk = siq / M_Kd4p_USGM; // siq -> (zk, zm, i, j) with j varying fastest and row stride Kd4p
+                        const uint zmij = siq % M_Kd4p_USGM;
+                        const uint zm = zmij / (M * Kd4p);
+                        const uint ij = zmij % (M * Kd4p);
+                        const uint i = ij / Kd4p;
+                        const uint j = min(ij % Kd4p, Kd4 - 1); // the PAD column (if any) re-reads column Kd4 - 1 instead of indexing past the row
+
+                        const uint gm = (mi + zm) * M + i; // index along size
+                        const uint gk = (ki + zk) * Kd4 + j; // index along inch, in units of 4
+
+                        const uint gi = (gz * inchd4 + gk) * cstepd2 + gm;
+                        prefetch_tmp_v[q] = bottom_tm_low_high_data[gi];
+                    }
+                }
+            }
+        }
+        {
+            const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK; // start of the first weight tile for this (gz, wgni, sgni)
+
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    prefetch_tmp_k[q] = weight_tm_data[w_offset + siq]; // linear copy: the weight buffer is indexed with the same flat index as the shared tile
+                }
+            }
+        }
+
+        uint k = 0;
+        for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K) // every batch except the last one, which is consumed after the loop
+        {
+            // copy the prefetched tile from the local arrays to shared memory
+            {
+                [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                    {
+                        tmp_v0[sgmi][siq] = prefetch_tmp_v[q].x;
+                        tmp_v1[sgmi][siq] = prefetch_tmp_v[q].y;
+                    }
+                }
+            }
+            {
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        tmp_k0[sgni][siq] = prefetch_tmp_k[q].x;
+                        tmp_k1[sgni][siq] = prefetch_tmp_k[q].y;
+                    }
+                }
+            }
+
+            barrier(); // shared tiles are written by several subgroups; wait before loading matrices from them
+
+            // prefetch the next tile (K step k + UNROLL_SG_K) into the local arrays
+            const uint ki = k + UNROLL_SG_K;
+            {
+                if (elempack == 1)
+                {
+                    const uint cstepd4 = psc(cstep) / 4;
+
+                    [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+
+                            const uint gi = (gz * inch + gk) * cstepd4 + gm;
+                            prefetch_tmp_v[q] = ivec2(i8buffer_sm4(bottom_tm_low_data, gi), i8buffer_sm4(bottom_tm_high_data, gi));
+                        }
+                    }
+                }
+                else // elempack != 1 (the original note says elempack == 4)
+                {
+                    const uint inchd4 = inch / 4;
+                    const uint cstepd2 = psc(cstep) / 2;
+
+                    [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                        {
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zm = zmij / (M * Kd4p);
+                            const uint ij = zmij % (M * Kd4p);
+                            const uint i = ij / Kd4p;
+                            const uint j = min(ij % Kd4p, Kd4 - 1); // clamp the PAD column onto the last valid column
+
+                            const uint gm = (mi + zm) * M + i;
+                            const uint gk = (ki + zk) * Kd4 + j;
+
+                            const uint gi = (gz * inchd4 + gk) * cstepd2 + gm;
+                            prefetch_tmp_v[q] = bottom_tm_low_high_data[gi];
+                        }
+                    }
+                }
+            }
+            {
+                const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; // weight tiles are ordered by batch (ki / UNROLL_SG_K), then by sgni
+
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        prefetch_tmp_k[q] = weight_tm_data[w_offset + siq];
+                    }
+                }
+            }
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A0[UNROLL_SG_M]; // from tmp_v0 (low input part)
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A1[UNROLL_SG_M]; // from tmp_v1 (high input part)
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B0[UNROLL_SG_N]; // from tmp_k0 (weight .x)
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B1[UNROLL_SG_N]; // from tmp_k1 (weight .y)
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M]; // from tmp_v0 (low input part)
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M]; // from tmp_v1 (high input part)
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N]; // from tmp_k0 (weight .x)
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N]; // from tmp_k1 (weight .y)
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor); // column-major with stride Md4, matching the (i, j) fill order for elempack == 1
+                        coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+                        coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor); // row-major with padded stride Kd4p
+                        coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+                        coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                    coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+                    coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                    coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false);
+                    coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]); // low x low
+                        sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]); // high x low
+                        sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]); // high x high
+                        sum8[zn][zm] = coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]); // low x high, same scale as high x low
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]); // low x low
+                        sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]); // high x low
+                        sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]); // high x high
+                        sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]); // low x high, same scale as high x low
+#endif
+                    }
+                }
+            }
+
+            barrier(); // all matrix loads are done before the next iteration overwrites the shared tiles
+        }
+
+        // store the last prefetched tile to shared memory; nothing is left to prefetch
+        {
+            [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                {
+                    tmp_v0[sgmi][siq] = prefetch_tmp_v[q].x;
+                    tmp_v1[sgmi][siq] = prefetch_tmp_v[q].y;
+                }
+            }
+        }
+        {
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    tmp_k0[sgni][siq] = prefetch_tmp_k[q].x;
+                    tmp_k1[sgni][siq] = prefetch_tmp_k[q].y;
+                }
+            }
+        }
+
+        barrier(); // last shared tiles complete before the final multiply
+
+#if ncnn_VK_KHR_cooperative_matrix
+        coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A0[UNROLL_SG_M];
+        coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A1[UNROLL_SG_M];
+        coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B0[UNROLL_SG_N];
+        coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B1[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+        icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M];
+        icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M];
+        icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N];
+        icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N];
+#endif
+
+        [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++) // same load + multiply sequence as inside the loop, for the final batch
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+                if (elempack == 1)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+                    coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+                    coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                }
+                else
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+                    coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+                    coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                }
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+                coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false);
+                coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false);
+#endif
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]);
+                    sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]);
+                    sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]);
+                    sum8[zn][zm] = coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]);
+                    sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]);
+                    sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]);
+                    sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]);
+#endif
+                }
+            }
+        }
+    }
+    else
+    {
+        // no ping-pong version: each batch is loaded straight into shared memory and multiplied before the next load
+        for (uint k = 0; k < kk; k += UNROLL_SG_K)
+        {
+            {
+                const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                const uint V_USGM_USGK = elempack == 1 ? Md4_K_USGM_USGK : M_Kd4p_USGM_USGK;
+                const uint V_USGM_USGK_d_subgroupsize = (V_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < V_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si; // flat index inside the input tile handled by this invocation
+
+                    if (V_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < V_USGM_USGK)
+                    {
+                        if (elempack == 1)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (k + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+
+                            const uint cstepd4 = psc(cstep) / 4;
+                            const uint gi = (gz * inch + gk) * cstepd4 + gm;
+                            tmp_v0[sgmi][siq] = i8buffer_sm4(bottom_tm_low_data, gi);
+                            tmp_v1[sgmi][siq] = i8buffer_sm4(bottom_tm_high_data, gi);
+                        }
+                        else
+                        {
+                            const uint cstepd2 = psc(cstep) / 2;
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zm = zmij / (M * Kd4p);
+                            const uint ij = zmij % (M * Kd4p);
+                            const uint i = ij / Kd4p;
+                            const uint j = min(ij % Kd4p, Kd4 - 1); // clamp the PAD column onto the last valid column
+
+                            const uint gm = (mi + zm) * M + i;
+                            const uint gk = (k + zk) * Kd4 + j;
+
+                            const ivec2 v01 = bottom_tm_low_high_data[(gz * (inch / 4) + gk) * cstepd2 + gm];
+                            tmp_v0[sgmi][siq] = v01.x;
+                            tmp_v1[sgmi][siq] = v01.y;
+                        }
+                    }
+                }
+            }
+            {
+                const uint K_Nd4p = K * Nd4p;
+                const uint K_Nd4p_USGN = K_Nd4p * UNROLL_SG_N;
+                const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+                const uint w_offset = gz * wbstep + (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((k / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK; // same weight addressing as the ping-pong path
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        const ivec2 k01 = weight_tm_data[w_offset + siq];
+                        tmp_k0[sgni][siq] = k01.x;
+                        tmp_k1[sgni][siq] = k01.y;
+                    }
+                }
+            }
+
+            barrier(); // shared tiles complete before loading matrices from them
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A0[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A1[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B0[UNROLL_SG_N];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B1[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A0[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A1[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B0[UNROLL_SG_N];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B1[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+                        coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+                        coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+                        coopMatLoad(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A0[zm], tmp_v0[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+                        coopMatLoadNV(A1[zm], tmp_v1[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                    coopMatLoad(B0[zn], tmp_k0[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+                    coopMatLoad(B1[zn], tmp_k1[sgni], k_offset, Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    const uint k_offset = (zk * UNROLL_SG_N + zn) * (K * Nd4p);
+                    coopMatLoadNV(B0[zn], tmp_k0[sgni], k_offset, Nd4p, false);
+                    coopMatLoadNV(B1[zn], tmp_k1[sgni], k_offset, Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A0[zm], B0[zn], sum[zn][zm]);
+                        sum8[zn][zm] = coopMatMulAdd(A1[zm], B0[zn], sum8[zn][zm]);
+                        sum16[zn][zm] = coopMatMulAdd(A1[zm], B1[zn], sum16[zn][zm]);
+                        sum8[zn][zm] = coopMatMulAdd(A0[zm], B1[zn], sum8[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A0[zm], B0[zn], sum[zn][zm]);
+                        sum8[zn][zm] = coopMatMulAddNV(A1[zm], B0[zn], sum8[zn][zm]);
+                        sum16[zn][zm] = coopMatMulAddNV(A1[zm], B1[zn], sum16[zn][zm]);
+                        sum8[zn][zm] = coopMatMulAddNV(A0[zm], B1[zn], sum8[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier(); // matrix loads done before the next iteration overwrites the shared tiles
+        }
+    }
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+            sum[zn][zm] += sum8[zn][zm] * (1 << 8) + sum16[zn][zm] * (1 << 16); // recombine partial products: (A0 + A1 * 2^8) * (B0 + B1 * 2^8)
+        }
+    }
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+            if (out_elempack == 1)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor); // column-major: each ivec4 holds 4 consecutive M positions of one N column
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true);
+#endif
+            }
+            else
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor); // row-major: each ivec4 holds 4 consecutive N columns of one M row
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false);
+#endif
+            }
+        }
+    }
+
+    barrier(); // staged results in tmp_o are read back below
+
+    if (out_elempack == 1)
+    {
+        const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N; // ivec4 entries staged by this subgroup
+        const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN)
+            {
+                const uint zn = siq / (Md4 * N * UNROLL_SG_M); // siq -> (zn, zm, i, j) with j varying fastest
+                const uint zmij = siq % (Md4 * N * UNROLL_SG_M);
+                const uint zm = zmij / (Md4 * N);
+                const uint ij = zmij % (Md4 * N);
+                const uint i = ij / Md4;
+                const uint j = ij % Md4;
+
+                const uint gn = (ni + zn) * N + i; // output channel
+                const uint gm = (mi + zm) * Md4 + j; // position along size, in units of 4
+
+                if (gn < outch && gm * 4 < psc(size))
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    const uint gi = (gz * outch + gn) * psc(outcstep) + gm * 4;
+
+                    top_tm_blob_data[gi + 0] = sumi.r;
+                    if (gm * 4 + 1 < psc(size)) top_tm_blob_data[gi + 1] = sumi.g; // tail guards: size need not be a multiple of 4
+                    if (gm * 4 + 2 < psc(size)) top_tm_blob_data[gi + 2] = sumi.b;
+                    if (gm * 4 + 3 < psc(size)) top_tm_blob_data[gi + 3] = sumi.a;
+                }
+            }
+        }
+    }
+    else
+    {
+        const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N; // ivec4 entries staged by this subgroup
+        const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN)
+            {
+                const uint zn = siq / (M * Nd4 * UNROLL_SG_M); // siq -> (zn, zm, i, j) with j varying fastest
+                const uint zmij = siq % (M * Nd4 * UNROLL_SG_M);
+                const uint zm = zmij / (M * Nd4);
+                const uint ij = zmij % (M * Nd4);
+                const uint i = ij / Nd4;
+                const uint j = ij % Nd4;
+
+                const uint gm = (mi + zm) * M + i; // position along size
+                const uint gn = (ni + zn) * N + j * 4; // first of 4 consecutive output channels
+
+                if (gm < psc(size) && gn < outch)
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    top_tm_blob_data_4[(gz * (outch / 4) + gn / 4) * psc(outcstep) + gm] = sumi; // one ivec4 per (channel group, position)
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp b/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp
new file mode 100644
index 000000000000..f52d1fdb784f
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_gemm_int8_cm.comp
@@ -0,0 +1,890 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_basic : require
+
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#if ncnn_VK_KHR_cooperative_matrix
+#extension GL_KHR_cooperative_matrix : require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix : require
+#extension GL_NV_integer_cooperative_matrix : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const uint kernel_w = 1; // kernel width; main() computes maxk = kernel_w * kernel_h
+layout(constant_id = 1) const uint kernel_h = 1; // kernel height
+layout(constant_id = 2) const uint dilation_w = 1; // multiplies the kernel column kx when main() computes the input offset
+layout(constant_id = 3) const uint dilation_h = 1; // multiplies the kernel row ky when main() computes the input offset
+layout(constant_id = 4) const uint stride_w = 1; // multiplies the output column sx when main() computes the input offset
+layout(constant_id = 5) const uint stride_h = 1; // multiplies the output row sy when main() computes the input offset
+layout(constant_id = 6) const int bias_term = 0; // NOTE(review): not referenced in the load/accumulate part of main(); presumably gates the bias add in the output stage - confirm
+layout(constant_id = 7) const int activation_type = 0; // NOTE(review): presumably forwarded to the helpers from vulkan_activation.comp - confirm
+layout(constant_id = 8) const float activation_param_0 = 0;
+layout(constant_id = 9) const float activation_param_1 = 0;
+layout(constant_id = 10) const int use_int8_requantize = 0; // NOTE(review): presumably selects top_blob_int8 over top_blob as the output - confirm
+layout(constant_id = 11) const uint elempack = 1; // input packing; main() branches on elempack == 1 and treats the else case as 4
+layout(constant_id = 12) const uint out_elempack = 1;
+
+#define shape_constant_id_offset 13
+layout(constant_id = shape_constant_id_offset + 0) const uint w = 0; // shape values; main() reads them through psc(...), e.g. psc(w), psc(outw), psc(cstep)
+layout(constant_id = shape_constant_id_offset + 1) const uint h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const uint cstep = 0;
+layout(constant_id = shape_constant_id_offset + 3) const uint outw = 0;
+layout(constant_id = shape_constant_id_offset + 4) const uint outh = 0;
+layout(constant_id = shape_constant_id_offset + 5) const uint outcstep = 0;
+layout(constant_id = shape_constant_id_offset + 6) const uint num_output = 0;
+layout(constant_id = shape_constant_id_offset + 7) const uint num_input = 0;
+
+layout(constant_id = shape_constant_id_offset + 8 + 0) const uint M = 1; // cooperative-matrix tile dims: accumulator is M x N, A is M x K, B is K x N
+layout(constant_id = shape_constant_id_offset + 8 + 1) const uint N = 1;
+layout(constant_id = shape_constant_id_offset + 8 + 2) const uint K = 1;
+layout(constant_id = shape_constant_id_offset + 8 + 3) const uint subgroup_size = 32; // strides the per-invocation index siq when main() fills the tiles
+layout(constant_id = shape_constant_id_offset + 8 + 4) const uint UNROLL_SG_M = 2; // each subgroup keeps sum[UNROLL_SG_N][UNROLL_SG_M] accumulators
+layout(constant_id = shape_constant_id_offset + 8 + 5) const uint UNROLL_SG_N = 2;
+layout(constant_id = shape_constant_id_offset + 8 + 6) const uint UNROLL_SG_K = 2; // number of K tiles consumed per main-loop iteration (k += UNROLL_SG_K)
+layout(constant_id = shape_constant_id_offset + 8 + 7) const uint UNROLL_WG_M = 2; // subgroup grid inside a workgroup: sgmi = sgi / UNROLL_WG_N, sgni = sgi % UNROLL_WG_N
+layout(constant_id = shape_constant_id_offset + 8 + 8) const uint UNROLL_WG_N = 2;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input; main() reads it with i8buffer_ld4 (elempack == 1) or i8buffer_sm4 (elempack == 4)
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // NOTE(review): output written past the visible part of main() - confirm when it is used vs top_blob_int8
+layout(binding = 2) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; };
+layout(binding = 3) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // weights; main() reads one packed word per invocation at w_offset + siq via i8buffer_sm4
+layout(binding = 4) readonly buffer bias_blob { vec4 bias_data[]; };
+layout(binding = 5) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; };
+layout(binding = 6) readonly buffer top_scales_blob { sfp top_scales_data[]; };
+
+layout(push_constant) uniform parameter // members share names with the shape specialization constants; NOTE(review): presumably psc() falls back to these when a constant is 0 - confirm
+{
+    uint w;
+    uint h;
+    uint cstep;
+    uint outw;
+    uint outh;
+    uint outcstep;
+    uint num_output;
+    uint num_input;
+} p;
+
+const uint Nd4 = N / 4; // tile extents divided by 4; the shared tiles are int arrays and the elempack == 1 path stores packInt4x8 words
+const uint Kd4 = K / 4;
+const uint Md4 = M / 4;
+
+#if ncnn_VK_KHR_cooperative_matrix
+#define PAD 1
+#elif ncnn_VK_NV_cooperative_matrix
+#define PAD 0
+#endif
+
+const uint Nd4p = Nd4 + PAD; // padded row stride for tmp_k and its coopMatLoad calls (PAD is 1 with KHR, 0 with NV)
+const uint Kd4p = Kd4 + PAD; // padded row stride for tmp_v in the elempack == 4 layout
+
+shared int tmp_v[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; // staged input (A) tiles, indexed tmp_v[sgmi][siq] in main()
+shared int tmp_k[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; // staged weight (B) tiles, indexed tmp_k[sgni][siq] in main()
+shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * N / 4]; // NOTE(review): not touched in the visible part of main(); presumably receives the accumulators for the output stage - confirm
+
+void main()
+{
+    const uint maxk = kernel_w * kernel_h;
+    const uint size = psc(outw) * psc(outh);
+    const uint K0 = psc(num_input) * maxk;
+    const uint K04 = psc(num_input) / 4 * maxk;
+
+    const uint wgi = gl_WorkGroupID.x;
+    const uint sgi = gl_SubgroupID;
+
+    const uint wgmm = (size + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M);
+    const uint wgnn = (psc(num_output) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N);
+
+    const uint wgmi = wgi / wgnn;
+    const uint wgni = wgi % wgnn;
+
+    const uint sgmi = sgi / UNROLL_WG_N;
+    const uint sgni = sgi % UNROLL_WG_N;
+
+    const uint kk = (K0 + K - 1) / K;
+    const uint kk_padded = (kk + UNROLL_SG_K - 1) / UNROLL_SG_K * UNROLL_SG_K;
+
+    if (wgmi >= wgmm)
+        return;
+
+    const uint si = gl_SubgroupInvocationID;
+
+    const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M;
+    const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N;
+
+#if ncnn_VK_KHR_cooperative_matrix
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum[UNROLL_SG_N][UNROLL_SG_M];
+#elif ncnn_VK_NV_cooperative_matrix
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M];
+#endif
+
+    {
+        [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                sum[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+#elif ncnn_VK_NV_cooperative_matrix
+                sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+#endif
+            }
+        }
+    }
+
+    uint k = 0;
+
+    if (kk >= UNROLL_SG_K * 2)
+    {
+        // local stack and shared memory ping-pong
+
+        // prefetch
+        int prefetch_tmp_v[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4p + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)];
+        int prefetch_tmp_k[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4p + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)];
+
+        // prefetch the very first
+        {
+            if (elempack == 1)
+            {
+                const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                    {
+                        const uint zk = siq / Md4_K_USGM;
+                        const uint zmij = siq % Md4_K_USGM;
+                        const uint zm = zmij / (Md4 * K);
+                        const uint ij = zmij % (Md4 * K);
+                        const uint i = ij / Md4;
+                        const uint j = ij % Md4;
+
+                        const uint gk = zk * K + i;
+                        const uint gm = (mi + zm) * Md4 + j;
+                        const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3);
+
+                        ivec4 v4 = ivec4(0);
+                        if (gk < K0)
+                        {
+                            const uint sz = gk / maxk;
+                            const uint k = gk - sz * maxk;
+                            const uint ky = k / kernel_w;
+                            const uint kx = k - ky * kernel_w;
+
+                            const uvec4 sy = gm4 / psc(outw);
+                            const uvec4 sx = gm4 - sy * psc(outw);
+                            const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                            if (gm4.r < size)
+                            {
+                                const uint gi = sz * psc(cstep) + spatial.r;
+                                v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                            }
+                            if (gm4.g < size)
+                            {
+                                const uint gi = sz * psc(cstep) + spatial.g;
+                                v4.g = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                            }
+                            if (gm4.b < size)
+                            {
+                                const uint gi = sz * psc(cstep) + spatial.b;
+                                v4.b = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                            }
+                            if (gm4.a < size)
+                            {
+                                const uint gi = sz * psc(cstep) + spatial.a;
+                                v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                            }
+                        }
+
+                        prefetch_tmp_v[q] = packInt4x8(v4);
+                    }
+                }
+            }
+            else // elempack == 4
+            {
+                const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                    {
+                        const uint zk = siq / M_Kd4p_USGM;
+                        const uint zmij = siq % M_Kd4p_USGM;
+                        const uint zmi = zmij / Kd4p;
+                        const uint j = zmij % Kd4p;
+
+                        const uint gm = mi * M + zmi;
+                        const uint gk = zk * Kd4 + j;
+
+                        int v = 0;
+                        if (gm < size && gk < K04)
+                        {
+                            const uint sx = gm % psc(outw);
+                            const uint sy = gm / psc(outw);
+                            const uint sz = gk / maxk;
+                            const uint k = gk - sz * maxk;
+                            const uint ky = k / kernel_w;
+                            const uint kx = k - ky * kernel_w;
+                            const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                            v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial);
+                        }
+
+                        prefetch_tmp_v[q] = v;
+                    }
+                }
+            }
+        }
+        {
+            const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+            const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+            const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + sgni * K_Nd4p_USGN_USGK;
+            const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq);
+                }
+            }
+        }
+
+        for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K)
+        {
+            // copy prefetched tile to shared memory
+            {
+                if (elempack == 1)
+                {
+                    const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                        }
+                    }
+                }
+            }
+            {
+                const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        tmp_k[sgni][siq] = prefetch_tmp_k[q];
+                    }
+                }
+            }
+
+            barrier();
+
+            // prefetch next tile
+            const uint ki = k + UNROLL_SG_K;
+            {
+                if (elempack == 1)
+                {
+                    const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                    const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+                            const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3);
+
+                            ivec4 v4 = ivec4(0);
+                            if (gk < K0)
+                            {
+                                const uint sz = gk / maxk;
+                                const uint k = gk - sz * maxk;
+                                const uint ky = k / kernel_w;
+                                const uint kx = k - ky * kernel_w;
+
+                                const uvec4 sy = gm4 / psc(outw);
+                                const uvec4 sx = gm4 - sy * psc(outw);
+                                const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                                if (gm4.r < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.r;
+                                    v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.g < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.g;
+                                    v4.g = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.b < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.b;
+                                    v4.b = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.a < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.a;
+                                    v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                            }
+
+                            prefetch_tmp_v[q] = packInt4x8(v4);
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                    const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zmi = zmij / Kd4p;
+                            const uint j = zmij % Kd4p;
+
+                            const uint gm = mi * M + zmi;
+                            const uint gk = (ki + zk) * Kd4 + j;
+
+                            int v = 0;
+                            if (gm < size && gk < K04)
+                            {
+                                const uint sx = gm % psc(outw);
+                                const uint sy = gm / psc(outw);
+                                const uint sz = gk / maxk;
+                                const uint k = gk - sz * maxk;
+                                const uint ky = k / kernel_w;
+                                const uint kx = k - ky * kernel_w;
+                                const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                                v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial);
+                            }
+
+                            prefetch_tmp_v[q] = v;
+                        }
+                    }
+                }
+            }
+            {
+                const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+                const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK;
+
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        prefetch_tmp_k[q] = i8buffer_sm4(weight_data, w_offset + siq);
+                    }
+                }
+            }
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else // elempack == 4
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+
+        // copy and compute the last prefetched tile
+        {
+            if (elempack == 1)
+            {
+                const uint Md4_K_USGM_USGK = Md4 * K * UNROLL_SG_M * UNROLL_SG_K;
+                const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                    {
+                        tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                    }
+                }
+            }
+            else // elempack == 4
+            {
+                const uint M_Kd4p_USGM_USGK = M * Kd4p * UNROLL_SG_M * UNROLL_SG_K;
+                const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                    {
+                        tmp_v[sgmi][siq] = prefetch_tmp_v[q];
+                    }
+                }
+            }
+        }
+        {
+            const uint K_Nd4p_USGN_USGK = K * Nd4p * UNROLL_SG_N * UNROLL_SG_K;
+            const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                {
+                    tmp_k[sgni][siq] = prefetch_tmp_k[q];
+                }
+            }
+        }
+
+        barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+        coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+        coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+        icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+        icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+        [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+                if (elempack == 1)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                }
+                else // elempack == 4
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                }
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                }
+            }
+        }
+
+        barrier();
+    }
+    else
+    {
+        for (uint ki = 0; ki < kk; ki += UNROLL_SG_K)
+        {
+            {
+                if (elempack == 1)
+                {
+                    const uint Md4_K_USGM = Md4 * K * UNROLL_SG_M;
+                    const uint Md4_K_USGM_USGK = Md4_K_USGM * UNROLL_SG_K;
+                    const uint Md4_K_USGM_USGK_d_subgroupsize = (Md4_K_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < Md4_K_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (Md4_K_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < Md4_K_USGM_USGK)
+                        {
+                            const uint zk = siq / Md4_K_USGM;
+                            const uint zmij = siq % Md4_K_USGM;
+                            const uint zm = zmij / (Md4 * K);
+                            const uint ij = zmij % (Md4 * K);
+                            const uint i = ij / Md4;
+                            const uint j = ij % Md4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gm = (mi + zm) * Md4 + j;
+                            const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3);
+
+                            ivec4 v4 = ivec4(0);
+                            if (gk < K0)
+                            {
+                                const uint sz = gk / maxk;
+                                const uint k = gk - sz * maxk;
+                                const uint ky = k / kernel_w;
+                                const uint kx = k - ky * kernel_w;
+
+                                const uvec4 sy = gm4 / psc(outw);
+                                const uvec4 sx = gm4 - sy * psc(outw);
+                                const uvec4 spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                                if (gm4.r < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.r;
+                                    v4.r = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.g < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.g;
+                                    v4.g = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.b < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.b;
+                                    v4.b = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                                if (gm4.a < size)
+                                {
+                                    const uint gi = sz * psc(cstep) + spatial.a;
+                                    v4.a = i8buffer_ld4(bottom_blob_int8_data, gi / 4)[gi % 4];
+                                }
+                            }
+
+                            tmp_v[sgmi][siq] = packInt4x8(v4);
+                        }
+                    }
+                }
+                else // elempack == 4
+                {
+                    const uint M_Kd4p_USGM = M * Kd4p * UNROLL_SG_M;
+                    const uint M_Kd4p_USGM_USGK = M_Kd4p_USGM * UNROLL_SG_K;
+                    const uint M_Kd4p_USGM_USGK_d_subgroupsize = (M_Kd4p_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                    [[unroll]] for (uint q = 0; q < M_Kd4p_USGM_USGK_d_subgroupsize; q++)
+                    {
+                        const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                        if (M_Kd4p_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4p_USGM_USGK)
+                        {
+                            const uint zk = siq / M_Kd4p_USGM;
+                            const uint zmij = siq % M_Kd4p_USGM;
+                            const uint zmi = zmij / Kd4p;
+                            const uint j = zmij % Kd4p;
+
+                            const uint gm = mi * M + zmi;
+                            const uint gk = (ki + zk) * Kd4 + j;
+
+                            int v = 0;
+                            if (gm < size && gk < K04)
+                            {
+                                const uint sx = gm % psc(outw);
+                                const uint sy = gm / psc(outw);
+                                const uint sz = gk / maxk;
+                                const uint k = gk - sz * maxk;
+                                const uint ky = k / kernel_w;
+                                const uint kx = k - ky * kernel_w;
+                                const uint spatial = (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
+
+                                v = i8buffer_sm4(bottom_blob_int8_data, sz * psc(cstep) + spatial);
+                            }
+
+                            tmp_v[sgmi][siq] = v;
+                        }
+                    }
+                }
+            }
+
+            {
+                const uint K_Nd4p_USGN = K * Nd4p * UNROLL_SG_N;
+                const uint K_Nd4p_USGN_USGK = K_Nd4p_USGN * UNROLL_SG_K;
+                const uint K_Nd4p_USGN_USGK_d_subgroupsize = (K_Nd4p_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                const uint w_offset = (wgni * kk_padded * UNROLL_WG_N) * K_Nd4p_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4p_USGN_USGK;
+                [[unroll]] for (uint q = 0; q < K_Nd4p_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4p_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4p_USGN_USGK)
+                    {
+                        tmp_k[sgni][siq] = i8buffer_sm4(weight_data, w_offset + siq);
+                    }
+                }
+            }
+
+            barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+                    if (elempack == 1)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (Md4 * K), Md4, true);
+#endif
+                    }
+                    else // elempack == 4
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        coopMatLoad(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                        coopMatLoadNV(A[zm], tmp_v[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                    }
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_k[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+    }
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+            if (out_elempack == 1)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true);
+#endif
+            }
+            else // out_elempack == 4
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false);
+#endif
+            }
+        }
+    }
+
+    barrier();
+
+    if (out_elempack == 1)
+    {
+        const uint outcstepd4 = psc(outcstep) / 4;
+        const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N;
+        const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN)
+            {
+                const uint zn = siq / (Md4 * N * UNROLL_SG_M);
+                const uint zmij = siq % (Md4 * N * UNROLL_SG_M);
+                const uint zm = zmij / (Md4 * N);
+                const uint ij = zmij % (Md4 * N);
+                const uint i = ij / Md4;
+                const uint j = ij % Md4;
+
+                const uint gn = (ni + zn) * N + i;
+                const uint gm = (mi + zm) * Md4 + j;
+
+                if (gn < psc(num_output) && gm * 4 < size)
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    const int gn4 = int(gn % 4);
+                    vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4][gn4];
+
+                    if (bias_term == 1)
+                    {
+                        sumfp += bias_data[gn / 4][gn4];
+                    }
+
+                    sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1));
+
+                    if (use_int8_requantize == 1)
+                    {
+                        const float top_scale = float(buffer_ld1(top_scales_data, 0));
+                        sumfp *= top_scale;
+                        i8buffer_st4(top_blob_int8_data, gn * outcstepd4 + gm, float2int8vec4(sumfp));
+                    }
+                    else
+                    {
+                        buffer_st4(top_blob_data, gn * outcstepd4 + gm, afpvec4(sumfp));
+                    }
+                }
+            }
+        }
+    }
+    else // out_elempack == 4
+    {
+        const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N;
+        const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN)
+            {
+                const uint zn = siq / (M * Nd4 * UNROLL_SG_M);
+                const uint zmij = siq % (M * Nd4 * UNROLL_SG_M);
+                const uint zm = zmij / (M * Nd4);
+                const uint ij = zmij % (M * Nd4);
+                const uint i = ij / Nd4;
+                const uint j = ij % Nd4;
+
+                const uint gm = (mi + zm) * M + i;
+                const uint gn = (ni + zn) * N + j * 4;
+
+                if (gm < size && gn < psc(num_output))
+                {
+                    const ivec4 sumi = tmp_o[sgi][siq];
+                    vec4 sumfp = vec4(sumi) * weight_descales_data[gn / 4];
+
+                    if (bias_term == 1)
+                    {
+                        sumfp += bias_data[gn / 4];
+                    }
+
+                    sumfp = vec4(activation_afpvec4(afpvec4(sumfp), activation_type, activation_param_0, activation_param_1));
+
+                    if (use_int8_requantize == 1)
+                    {
+                        const float top_scale = float(buffer_ld1(top_scales_data, 0));
+                        sumfp *= top_scale;
+                        i8buffer_st4(top_blob_int8_data, (gn / 4) * psc(outcstep) + gm, float2int8vec4(sumfp));
+                    }
+                    else
+                    {
+                        buffer_st4(top_blob_data, (gn / 4) * psc(outcstep) + gm, afpvec4(sumfp));
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp
new file mode 100644
index 000000000000..6ac6d0563a18
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8.comp
@@ -0,0 +1,133 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // channel count bound checked against gl_GlobalInvocationID.z in main()
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input width
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // input height
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // element stride between input channels
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // element stride between channels of the transformed output
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // number of tiles along x
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // number of tiles along y
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input, read via i8buffer_ld4
+layout(binding = 1) writeonly buffer bottom_tm_blob { sint16vec4 bottom_tm_blob_data[]; }; // transformed output, written via i16buffer_st4
+
+layout(push_constant) uniform parameter // same names as the constants above; main() reads them through psc()
+{ // NOTE(review): psc() is defined outside this file; presumably it falls back to p.<name> when the constant is 0 - confirm
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p;
+
+void main() // One invocation per (tile x, tile y, channel index): loads a 4x4 patch, transforms it, stores 16 values.
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // channel index (bounded by c below)
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) // drop invocations outside the tile grid / channel range
+        return;
+
+    // load 4x4 patch; patches start every 2 pixels, so neighbouring patches overlap by 2
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of the patch's top-left corner
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // start index of each of the 4 patch rows
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); // (sx, sy) is read without a bounds check
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0); // every other element is zero when it lies past w or h
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx; // linear tile index, row-major over the tile grid
+
+    ivec4 m00 = v00 - v02; // pass 1, m0k = vk0 - vk2 for each patch row k
+    ivec4 m01 = v10 - v12;
+    ivec4 m02 = v20 - v22;
+    ivec4 m03 = v30 - v32;
+
+    ivec4 m10 = v01 + v02; // m1k = vk1 + vk2
+    ivec4 m11 = v11 + v12;
+    ivec4 m12 = v21 + v22;
+    ivec4 m13 = v31 + v32;
+
+    ivec4 m20 = v02 - v01; // m2k = vk2 - vk1
+    ivec4 m21 = v12 - v11;
+    ivec4 m22 = v22 - v21;
+    ivec4 m23 = v32 - v31;
+
+    ivec4 m30 = v03 - v01; // m3k = vk3 - vk1
+    ivec4 m31 = v13 - v11;
+    ivec4 m32 = v23 - v21;
+    ivec4 m33 = v33 - v31;
+
+    v00 = m00 - m02; // pass 2, same +/- pattern along the other index: vr0 = mr0 - mr2
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m01 + m02; // vr1 = mr1 + mr2
+    v11 = m11 + m12;
+    v21 = m21 + m22;
+    v31 = m31 + m32;
+
+    v02 = m02 - m01; // vr2 = mr2 - mr1
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01; // vr3 = mr3 - mr1
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16: coefficient vRK goes to plane 4 * R + K
+    int v_tm_offset = gz * psc(outcstep) + tile; // position of this tile/channel inside one plane
+    int v_tm_step = psc(outcstep) * psc(c); // element distance between consecutive planes
+
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v10);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v11);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v12);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v13);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v20);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v21);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v22);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v23);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v30);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v31);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v32);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v33);
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp
new file mode 100644
index 000000000000..b756d7862912
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input_int8_cm.comp
@@ -0,0 +1,182 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // channel count bound checked against gl_GlobalInvocationID.z in main()
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input width
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // input height
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // element stride between input channels
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // main() uses outcstep / 2 as the per-channel stride of the output
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // number of tiles along x
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // number of tiles along y
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input, read via i8buffer_ld4
+layout(binding = 1) writeonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; }; // per element: x = packed low parts, y = packed high parts
+
+layout(push_constant) uniform parameter // same names as the constants above; main() reads them through psc()
+{ // NOTE(review): psc() is defined outside this file; presumably it falls back to p.<name> when the constant is 0 - confirm
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p;
+
+void main() // Same 4x4 patch transform as the non-_cm variant, but each result is split into a signed low byte and a high part before storing.
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // channel index (bounded by c below)
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) // drop invocations outside the tile grid / channel range
+        return;
+
+    // load 4x4 patch; patches start every 2 pixels, so neighbouring patches overlap by 2
+    int sx = gx * 2;
+    int sy = gy * 2;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of the patch's top-left corner
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // start index of each of the 4 patch rows
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); // (sx, sy) is read without a bounds check
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0); // every other element is zero when it lies past w or h
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx; // linear tile index, row-major over the tile grid
+
+    ivec4 m00 = v00 - v02; // pass 1, m0k = vk0 - vk2 for each patch row k
+    ivec4 m01 = v10 - v12;
+    ivec4 m02 = v20 - v22;
+    ivec4 m03 = v30 - v32;
+
+    ivec4 m10 = v01 + v02; // m1k = vk1 + vk2
+    ivec4 m11 = v11 + v12;
+    ivec4 m12 = v21 + v22;
+    ivec4 m13 = v31 + v32;
+
+    ivec4 m20 = v02 - v01; // m2k = vk2 - vk1
+    ivec4 m21 = v12 - v11;
+    ivec4 m22 = v22 - v21;
+    ivec4 m23 = v32 - v31;
+
+    ivec4 m30 = v03 - v01; // m3k = vk3 - vk1
+    ivec4 m31 = v13 - v11;
+    ivec4 m32 = v23 - v21;
+    ivec4 m33 = v33 - v31;
+
+    v00 = m00 - m02; // pass 2, same +/- pattern along the other index: vr0 = mr0 - mr2
+    v10 = m10 - m12;
+    v20 = m20 - m22;
+    v30 = m30 - m32;
+
+    v01 = m01 + m02; // vr1 = mr1 + mr2
+    v11 = m11 + m12;
+    v21 = m21 + m22;
+    v31 = m31 + m32;
+
+    v02 = m02 - m01; // vr2 = mr2 - mr1
+    v12 = m12 - m11;
+    v22 = m22 - m21;
+    v32 = m32 - m31;
+
+    v03 = m03 - m01; // vr3 = mr3 - mr1
+    v13 = m13 - m11;
+    v23 = m23 - m21;
+    v33 = m33 - m31;
+
+    // store 16: coefficient vRK goes to plane 4 * R + K, as ivec2(packed low, packed high)
+    const int outcstepd2 = psc(outcstep) / 2; // NOTE(review): presumably outcstep counts units half the size of one ivec2 element - confirm against the host code
+    int v_tm_offset = gz * outcstepd2 + tile; // position of this tile/channel inside one plane
+    int v_tm_step = outcstepd2 * psc(c); // element distance between consecutive planes
+
+    ivec4 v00_low = v00 & ivec4(255); // low 8 bits, as 0..255
+    v00_low = mix(v00_low, v00_low - ivec4(256), greaterThanEqual(v00_low, ivec4(128))); // map 128..255 to -128..-1 so low is a signed byte value
+    ivec4 v00_high = (v00 - v00_low) >> 8; // remaining part, so that v00 == v00_high * 256 + v00_low
+    bottom_tm_low_high_data[v_tm_offset + 0 * v_tm_step] = ivec2(packInt4x8(v00_low), packInt4x8(v00_high)); // packInt4x8 is defined outside this file; presumably packs 4 lanes into one int - confirm
+    ivec4 v01_low = v01 & ivec4(255); // the same low/high split is repeated for the remaining 15 coefficients
+    v01_low = mix(v01_low, v01_low - ivec4(256), greaterThanEqual(v01_low, ivec4(128)));
+    ivec4 v01_high = (v01 - v01_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 1 * v_tm_step] = ivec2(packInt4x8(v01_low), packInt4x8(v01_high));
+    ivec4 v02_low = v02 & ivec4(255);
+    v02_low = mix(v02_low, v02_low - ivec4(256), greaterThanEqual(v02_low, ivec4(128)));
+    ivec4 v02_high = (v02 - v02_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 2 * v_tm_step] = ivec2(packInt4x8(v02_low), packInt4x8(v02_high));
+    ivec4 v03_low = v03 & ivec4(255);
+    v03_low = mix(v03_low, v03_low - ivec4(256), greaterThanEqual(v03_low, ivec4(128)));
+    ivec4 v03_high = (v03 - v03_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 3 * v_tm_step] = ivec2(packInt4x8(v03_low), packInt4x8(v03_high));
+    ivec4 v10_low = v10 & ivec4(255);
+    v10_low = mix(v10_low, v10_low - ivec4(256), greaterThanEqual(v10_low, ivec4(128)));
+    ivec4 v10_high = (v10 - v10_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 4 * v_tm_step] = ivec2(packInt4x8(v10_low), packInt4x8(v10_high));
+    ivec4 v11_low = v11 & ivec4(255);
+    v11_low = mix(v11_low, v11_low - ivec4(256), greaterThanEqual(v11_low, ivec4(128)));
+    ivec4 v11_high = (v11 - v11_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 5 * v_tm_step] = ivec2(packInt4x8(v11_low), packInt4x8(v11_high));
+    ivec4 v12_low = v12 & ivec4(255);
+    v12_low = mix(v12_low, v12_low - ivec4(256), greaterThanEqual(v12_low, ivec4(128)));
+    ivec4 v12_high = (v12 - v12_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 6 * v_tm_step] = ivec2(packInt4x8(v12_low), packInt4x8(v12_high));
+    ivec4 v13_low = v13 & ivec4(255);
+    v13_low = mix(v13_low, v13_low - ivec4(256), greaterThanEqual(v13_low, ivec4(128)));
+    ivec4 v13_high = (v13 - v13_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 7 * v_tm_step] = ivec2(packInt4x8(v13_low), packInt4x8(v13_high));
+    ivec4 v20_low = v20 & ivec4(255);
+    v20_low = mix(v20_low, v20_low - ivec4(256), greaterThanEqual(v20_low, ivec4(128)));
+    ivec4 v20_high = (v20 - v20_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 8 * v_tm_step] = ivec2(packInt4x8(v20_low), packInt4x8(v20_high));
+    ivec4 v21_low = v21 & ivec4(255);
+    v21_low = mix(v21_low, v21_low - ivec4(256), greaterThanEqual(v21_low, ivec4(128)));
+    ivec4 v21_high = (v21 - v21_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 9 * v_tm_step] = ivec2(packInt4x8(v21_low), packInt4x8(v21_high));
+    ivec4 v22_low = v22 & ivec4(255);
+    v22_low = mix(v22_low, v22_low - ivec4(256), greaterThanEqual(v22_low, ivec4(128)));
+    ivec4 v22_high = (v22 - v22_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 10 * v_tm_step] = ivec2(packInt4x8(v22_low), packInt4x8(v22_high));
+    ivec4 v23_low = v23 & ivec4(255);
+    v23_low = mix(v23_low, v23_low - ivec4(256), greaterThanEqual(v23_low, ivec4(128)));
+    ivec4 v23_high = (v23 - v23_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 11 * v_tm_step] = ivec2(packInt4x8(v23_low), packInt4x8(v23_high));
+    ivec4 v30_low = v30 & ivec4(255);
+    v30_low = mix(v30_low, v30_low - ivec4(256), greaterThanEqual(v30_low, ivec4(128)));
+    ivec4 v30_high = (v30 - v30_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 12 * v_tm_step] = ivec2(packInt4x8(v30_low), packInt4x8(v30_high));
+    ivec4 v31_low = v31 & ivec4(255);
+    v31_low = mix(v31_low, v31_low - ivec4(256), greaterThanEqual(v31_low, ivec4(128)));
+    ivec4 v31_high = (v31 - v31_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 13 * v_tm_step] = ivec2(packInt4x8(v31_low), packInt4x8(v31_high));
+    ivec4 v32_low = v32 & ivec4(255);
+    v32_low = mix(v32_low, v32_low - ivec4(256), greaterThanEqual(v32_low, ivec4(128)));
+    ivec4 v32_high = (v32 - v32_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 14 * v_tm_step] = ivec2(packInt4x8(v32_low), packInt4x8(v32_high));
+    ivec4 v33_low = v33 & ivec4(255);
+    v33_low = mix(v33_low, v33_low - ivec4(256), greaterThanEqual(v33_low, ivec4(128)));
+    ivec4 v33_high = (v33 - v33_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 15 * v_tm_step] = ivec2(packInt4x8(v33_low), packInt4x8(v33_high));
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp
new file mode 100644
index 000000000000..3170fb28473c
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output_int8.comp
@@ -0,0 +1,142 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // 1 = add bias_data[gz] to every output
+layout(constant_id = 1) const int activation_type = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 2) const float activation_param_0 = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 3) const float activation_param_1 = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 4) const int use_int8_requantize = 0; // 1 = scale and store int8 to binding 5, otherwise store to binding 1
+layout(binding = 0) readonly buffer top_tm_blob { ivec4 top_tm_blob_data[]; }; // 32-bit integer input, 16 planes per tile
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // floating-point output
+layout(binding = 2) readonly buffer bias_blob { vec4 bias_data[]; }; // indexed by gz
+layout(binding = 3) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; // indexed by gz
+layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // only element 0 is read
+layout(binding = 5) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; // int8 output
+
+layout(push_constant) uniform parameter // all shape values here are read directly as p.<name> (no psc() in this shader)
+{
+    int cstep; // element stride between channel groups of the transformed input
+
+    int block_x; // number of tiles along x
+    int block_y; // number of tiles along y
+
+    int outw; // output width
+    int outh; // output height
+    int outcstep; // element stride between channel groups of the output
+    int outc; // main() iterates over (outc + 3) / 4 groups
+} p;
+
+void main() // One invocation per (tile x, tile y, group of 4 output channels): reads 16 int accumulators, writes a 2x2 output patch.
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // index of the 4-channel output group
+
+    const int outc4 = (p.outc + 3) / 4; // p.outc divided by 4, rounded up
+
+    if (gx >= p.block_x || gy >= p.block_y || gz >= outc4) // drop invocations outside the tile grid / channel groups
+        return;
+
+    // load 16 planes for this tile; vRK comes from plane 4 * R + K
+    int v_tm_offset = gz * p.cstep + gy * p.block_x + gx; // position of this tile/group inside one plane
+    int v_tm_step = p.cstep * outc4; // element distance between consecutive planes
+
+    ivec4 v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step];
+    ivec4 v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step];
+    ivec4 v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step];
+    ivec4 v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step];
+    ivec4 v10 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step];
+    ivec4 v11 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step];
+    ivec4 v12 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step];
+    ivec4 v13 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step];
+    ivec4 v20 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step];
+    ivec4 v21 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step];
+    ivec4 v22 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step];
+    ivec4 v23 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step];
+    ivec4 v30 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step];
+    ivec4 v31 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step];
+    ivec4 v32 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step];
+    ivec4 v33 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step];
+
+    ivec4 m00 = v00 + v01 + v02; // pass 1, m0k = vk0 + vk1 + vk2
+    ivec4 m01 = v10 + v11 + v12;
+    ivec4 m02 = v20 + v21 + v22;
+    ivec4 m03 = v30 + v31 + v32;
+
+    ivec4 m10 = v01 - v02 + v03; // m1k = vk1 - vk2 + vk3
+    ivec4 m11 = v11 - v12 + v13;
+    ivec4 m12 = v21 - v22 + v23;
+    ivec4 m13 = v31 - v32 + v33;
+
+    const vec4 descale = vec4(0.25f) * weight_descales_data[gz]; // NOTE(review): the 0.25 presumably undoes a x4 integer scaling applied in an earlier transform - confirm
+
+    vec4 out00 = vec4(m00 + m01 + m02) * descale; // pass 2 is done in integers, then converted to float and descaled
+    vec4 out01 = vec4(m01 - m02 + m03) * descale;
+    vec4 out10 = vec4(m10 + m11 + m12) * descale;
+    vec4 out11 = vec4(m11 - m12 + m13) * descale;
+
+    if (bias_term == 1)
+    {
+        const vec4 bias_value = bias_data[gz]; // one bias vec4 per channel group, shared by all 4 pixels
+
+        out00 += bias_value;
+        out01 += bias_value;
+        out10 += bias_value;
+        out11 += bias_value;
+    }
+
+    out00 = vec4(activation_afpvec4(afpvec4(out00), activation_type, activation_param_0, activation_param_1)); // activation helper works on afpvec4, so convert there and back
+    out01 = vec4(activation_afpvec4(afpvec4(out01), activation_type, activation_param_0, activation_param_1));
+    out10 = vec4(activation_afpvec4(afpvec4(out10), activation_type, activation_param_0, activation_param_1));
+    out11 = vec4(activation_afpvec4(afpvec4(out11), activation_type, activation_param_0, activation_param_1));
+
+    // store 2x2 patch whose top-left output pixel is (x, y)
+    int x = gx * 2;
+    int y = gy * 2;
+
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = float(buffer_ld1(top_scales_data, 0)); // explicit float(), matching the other requantize paths in this patch; a single scale is used for all channels
+
+        out00 *= top_scale;
+        out01 *= top_scale;
+        out10 *= top_scale;
+        out11 *= top_scale;
+
+        ivec4 out00_int8 = float2int8vec4(out00); // float2int8vec4 is defined outside this file
+        ivec4 out01_int8 = float2int8vec4(out01);
+        ivec4 out10_int8 = float2int8vec4(out10);
+        ivec4 out11_int8 = float2int8vec4(out11);
+
+        int v_offset_0 = gz * p.outcstep + y * p.outw + x; // output row y
+        int v_offset_1 = v_offset_0 + p.outw; // output row y + 1
+
+        i8buffer_st4(top_blob_int8_data, v_offset_0 + 0, out00_int8); // (x, y) is written without a bounds check
+        if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset_0 + 1, out01_int8); // skip pixels past the right edge
+
+        if (y + 1 < p.outh) // skip the second row when it is past the bottom edge
+        {
+            i8buffer_st4(top_blob_int8_data, v_offset_1 + 0, out10_int8);
+            if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset_1 + 1, out11_int8);
+        }
+    }
+    else
+    {
+        int v_offset_0 = gz * p.outcstep + y * p.outw + x; // output row y
+        int v_offset_1 = v_offset_0 + p.outw; // output row y + 1
+
+        buffer_st4(top_blob_data, v_offset_0 + 0, afpvec4(out00)); // (x, y) is written without a bounds check
+        if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset_0 + 1, afpvec4(out01)); // skip pixels past the right edge
+
+        if (y + 1 < p.outh) // skip the second row when it is past the bottom edge
+        {
+            buffer_st4(top_blob_data, v_offset_1 + 0, afpvec4(out10));
+            if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset_1 + 1, afpvec4(out11));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp
new file mode 100644
index 000000000000..191427040895
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8.comp
@@ -0,0 +1,220 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // upper bound for gl_GlobalInvocationID.z; also a factor of the output plane stride
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input row stride and x bound for loads
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // y bound for loads
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // input stride between consecutive gz slices
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // output stride between consecutive gz slices
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // tile count along x
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // tile count along y
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // source elements, read through i8buffer_ld4
+layout(binding = 1) writeonly buffer bottom_tm_blob { sint16vec4 bottom_tm_blob_data[]; }; // 36 transformed planes, written through i16buffer_st4
+
+layout(push_constant) uniform parameter // same names as the constants above; main() reads both through psc(), which is defined elsewhere
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p;
+
+void main() // One invocation turns one 6x6 input tile of slice gz into 36 transformed values, one per output plane.
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) // psc() is defined elsewhere; presumably picks the specialization or push constant - confirm
+        return; // invocations outside the tile grid do nothing
+
+    // load 6x6: the window starts at (sx, sy) and tiles advance by 4; vRC is the element at row sy + R, column sx + C
+    int sx = gx * 4;
+    int sy = gy * 4;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of (row sy, column sx) in slice gz
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // start of rows sy + 0..3
+    ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); // start of rows sy + 4 and sy + 5
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); // read unconditionally; every other element becomes 0 when outside w/h
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0);
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+    ivec4 v04 = sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 4) : ivec4(0);
+    ivec4 v05 = sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 5) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+    ivec4 v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 4) : ivec4(0);
+    ivec4 v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 5) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+    ivec4 v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 4) : ivec4(0);
+    ivec4 v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 5) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+    ivec4 v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 4) : ivec4(0);
+    ivec4 v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 5) : ivec4(0);
+
+    ivec4 v40 = sy + 4 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 0) : ivec4(0);
+    ivec4 v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 1) : ivec4(0);
+    ivec4 v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 2) : ivec4(0);
+    ivec4 v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 3) : ivec4(0);
+    ivec4 v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 4) : ivec4(0);
+    ivec4 v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 5) : ivec4(0);
+
+    ivec4 v50 = sy + 5 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 0) : ivec4(0);
+    ivec4 v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 1) : ivec4(0);
+    ivec4 v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 2) : ivec4(0);
+    ivec4 v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 3) : ivec4(0);
+    ivec4 v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 4) : ivec4(0);
+    ivec4 v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 5) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx; // linear tile index, row-major over the tile grid
+
+    ivec4 m00 = v04 + v00 * 4 - v02 * 5; // first pass: mAB applies coefficient row A along x to input row B; row 0 = (4, 0, -5, 0, 1, 0)
+    ivec4 m01 = v14 + v10 * 4 - v12 * 5;
+    ivec4 m02 = v24 + v20 * 4 - v22 * 5;
+    ivec4 m03 = v34 + v30 * 4 - v32 * 5;
+    ivec4 m04 = v44 + v40 * 4 - v42 * 5;
+    ivec4 m05 = v54 + v50 * 4 - v52 * 5;
+
+    ivec4 m10 = (v04 - v02 * 4) + (v03 - v01 * 4); // row 1 = (0, -4, -4, 1, 1, 0)
+    ivec4 m11 = (v14 - v12 * 4) + (v13 - v11 * 4);
+    ivec4 m12 = (v24 - v22 * 4) + (v23 - v21 * 4);
+    ivec4 m13 = (v34 - v32 * 4) + (v33 - v31 * 4);
+    ivec4 m14 = (v44 - v42 * 4) + (v43 - v41 * 4);
+    ivec4 m15 = (v54 - v52 * 4) + (v53 - v51 * 4);
+
+    ivec4 m20 = (v04 - v02 * 4) - (v03 - v01 * 4); // row 2 = (0, 4, -4, -1, 1, 0)
+    ivec4 m21 = (v14 - v12 * 4) - (v13 - v11 * 4);
+    ivec4 m22 = (v24 - v22 * 4) - (v23 - v21 * 4);
+    ivec4 m23 = (v34 - v32 * 4) - (v33 - v31 * 4);
+    ivec4 m24 = (v44 - v42 * 4) - (v43 - v41 * 4);
+    ivec4 m25 = (v54 - v52 * 4) - (v53 - v51 * 4);
+
+    ivec4 m30 = (v04 - v02) + (v03 - v01) * 2; // row 3 = (0, -2, -1, 2, 1, 0)
+    ivec4 m31 = (v14 - v12) + (v13 - v11) * 2;
+    ivec4 m32 = (v24 - v22) + (v23 - v21) * 2;
+    ivec4 m33 = (v34 - v32) + (v33 - v31) * 2;
+    ivec4 m34 = (v44 - v42) + (v43 - v41) * 2;
+    ivec4 m35 = (v54 - v52) + (v53 - v51) * 2;
+
+    ivec4 m40 = (v04 - v02) - (v03 - v01) * 2; // row 4 = (0, 2, -1, -2, 1, 0)
+    ivec4 m41 = (v14 - v12) - (v13 - v11) * 2;
+    ivec4 m42 = (v24 - v22) - (v23 - v21) * 2;
+    ivec4 m43 = (v34 - v32) - (v33 - v31) * 2;
+    ivec4 m44 = (v44 - v42) - (v43 - v41) * 2;
+    ivec4 m45 = (v54 - v52) - (v53 - v51) * 2;
+
+    ivec4 m50 = v05 + v01 * 4 - v03 * 5; // row 5 = (0, 4, 0, -5, 0, 1)
+    ivec4 m51 = v15 + v11 * 4 - v13 * 5;
+    ivec4 m52 = v25 + v21 * 4 - v23 * 5;
+    ivec4 m53 = v35 + v31 * 4 - v33 * 5;
+    ivec4 m54 = v45 + v41 * 4 - v43 * 5;
+    ivec4 m55 = v55 + v51 * 4 - v53 * 5;
+
+    v00 = m04 + m00 * 4 - m02 * 5; // second pass: vAB applies coefficient row B to mA0..mA5, overwriting the input variables; here B = 0
+    v10 = m14 + m10 * 4 - m12 * 5;
+    v20 = m24 + m20 * 4 - m22 * 5;
+    v30 = m34 + m30 * 4 - m32 * 5;
+    v40 = m44 + m40 * 4 - m42 * 5;
+    v50 = m54 + m50 * 4 - m52 * 5;
+
+    v01 = (m04 - m02 * 4) + (m03 - m01 * 4); // B = 1
+    v11 = (m14 - m12 * 4) + (m13 - m11 * 4);
+    v21 = (m24 - m22 * 4) + (m23 - m21 * 4);
+    v31 = (m34 - m32 * 4) + (m33 - m31 * 4);
+    v41 = (m44 - m42 * 4) + (m43 - m41 * 4);
+    v51 = (m54 - m52 * 4) + (m53 - m51 * 4);
+
+    v02 = (m04 - m02 * 4) - (m03 - m01 * 4); // B = 2
+    v12 = (m14 - m12 * 4) - (m13 - m11 * 4);
+    v22 = (m24 - m22 * 4) - (m23 - m21 * 4);
+    v32 = (m34 - m32 * 4) - (m33 - m31 * 4);
+    v42 = (m44 - m42 * 4) - (m43 - m41 * 4);
+    v52 = (m54 - m52 * 4) - (m53 - m51 * 4);
+
+    v03 = (m04 - m02) + (m03 - m01) * 2; // B = 3
+    v13 = (m14 - m12) + (m13 - m11) * 2;
+    v23 = (m24 - m22) + (m23 - m21) * 2;
+    v33 = (m34 - m32) + (m33 - m31) * 2;
+    v43 = (m44 - m42) + (m43 - m41) * 2;
+    v53 = (m54 - m52) + (m53 - m51) * 2;
+
+    v04 = (m04 - m02) - (m03 - m01) * 2; // B = 4
+    v14 = (m14 - m12) - (m13 - m11) * 2;
+    v24 = (m24 - m22) - (m23 - m21) * 2;
+    v34 = (m34 - m32) - (m33 - m31) * 2;
+    v44 = (m44 - m42) - (m43 - m41) * 2;
+    v54 = (m54 - m52) - (m53 - m51) * 2;
+
+    v05 = m05 + m01 * 4 - m03 * 5; // B = 5
+    v15 = m15 + m11 * 4 - m13 * 5;
+    v25 = m25 + m21 * 4 - m23 * 5;
+    v35 = m35 + m31 * 4 - m33 * 5;
+    v45 = m45 + m41 * 4 - m43 * 5;
+    v55 = m55 + m51 * 4 - m53 * 5;
+
+    // store 36: vAB is written to plane 6 * A + B; consecutive planes are psc(outcstep) * psc(c) elements apart
+    int v_tm_offset = gz * psc(outcstep) + tile; // position of this tile inside plane 0
+    int v_tm_step = psc(outcstep) * psc(c); // distance between planes
+
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * v_tm_step, v00); // each pass scales magnitudes by at most 10, so |v| <= 100 * 128 if loads are in [-128, 127] (assumed) and fits 16 bits
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * v_tm_step, v01);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * v_tm_step, v02);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * v_tm_step, v03);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * v_tm_step, v04);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * v_tm_step, v05);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * v_tm_step, v10);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * v_tm_step, v11);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * v_tm_step, v12);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * v_tm_step, v13);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * v_tm_step, v14);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * v_tm_step, v15);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * v_tm_step, v20);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * v_tm_step, v21);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * v_tm_step, v22);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * v_tm_step, v23);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 16 * v_tm_step, v24);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 17 * v_tm_step, v25);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 18 * v_tm_step, v30);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 19 * v_tm_step, v31);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 20 * v_tm_step, v32);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 21 * v_tm_step, v33);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 22 * v_tm_step, v34);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 23 * v_tm_step, v35);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 24 * v_tm_step, v40);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 25 * v_tm_step, v41);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 26 * v_tm_step, v42);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 27 * v_tm_step, v43);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 28 * v_tm_step, v44);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 29 * v_tm_step, v45);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 30 * v_tm_step, v50);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 31 * v_tm_step, v51);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 32 * v_tm_step, v52);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 33 * v_tm_step, v53);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 34 * v_tm_step, v54);
+    i16buffer_st4(bottom_tm_blob_data, v_tm_offset + 35 * v_tm_step, v55);
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp
new file mode 100644
index 000000000000..d2a1d70895f4
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input_int8_cm.comp
@@ -0,0 +1,329 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int c = 0; // upper bound for gl_GlobalInvocationID.z; also a factor of the output plane stride
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input row stride and x bound for loads
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // y bound for loads
+layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; // input stride between consecutive gz slices
+
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // main() halves this before using it as the output slice stride
+
+layout(constant_id = shape_constant_id_offset + 4) const int block_x = 0; // tile count along x
+layout(constant_id = shape_constant_id_offset + 5) const int block_y = 0; // tile count along y
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // source elements, read through i8buffer_ld4
+layout(binding = 1) writeonly buffer bottom_tm_low_high_blob { ivec2 bottom_tm_low_high_data[]; }; // per element: x = packed low parts, y = packed high parts
+
+layout(push_constant) uniform parameter // same names as the constants above; main() reads both through psc(), which is defined elsewhere
+{
+    int w;
+    int h;
+    int cstep;
+
+    int outcstep;
+
+    int block_x;
+    int block_y;
+
+    int c;
+} p;
+
+void main() // One invocation turns one 6x6 input tile of slice gz into 36 transformed values, each stored as a (low, high) byte-split pair.
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) // psc() is defined elsewhere; presumably picks the specialization or push constant - confirm
+        return; // invocations outside the tile grid do nothing
+
+    // load 6x6: the window starts at (sx, sy) and tiles advance by 4; vRC is the element at row sy + R, column sx + C
+    int sx = gx * 4;
+    int sy = gy * 4;
+
+    int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; // element index of (row sy, column sx) in slice gz
+    ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); // start of rows sy + 0..3
+    ivec2 v_offset45 = v_offset_0 + ivec2(4, 5) * psc(w); // start of rows sy + 4 and sy + 5
+
+    ivec4 v00 = i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 0); // read unconditionally; every other element becomes 0 when outside w/h
+    ivec4 v01 = sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 1) : ivec4(0);
+    ivec4 v02 = sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 2) : ivec4(0);
+    ivec4 v03 = sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 3) : ivec4(0);
+    ivec4 v04 = sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 4) : ivec4(0);
+    ivec4 v05 = sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.r + 5) : ivec4(0);
+
+    ivec4 v10 = sy + 1 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 0) : ivec4(0);
+    ivec4 v11 = sy + 1 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 1) : ivec4(0);
+    ivec4 v12 = sy + 1 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 2) : ivec4(0);
+    ivec4 v13 = sy + 1 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 3) : ivec4(0);
+    ivec4 v14 = sy + 1 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 4) : ivec4(0);
+    ivec4 v15 = sy + 1 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.g + 5) : ivec4(0);
+
+    ivec4 v20 = sy + 2 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 0) : ivec4(0);
+    ivec4 v21 = sy + 2 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 1) : ivec4(0);
+    ivec4 v22 = sy + 2 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 2) : ivec4(0);
+    ivec4 v23 = sy + 2 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 3) : ivec4(0);
+    ivec4 v24 = sy + 2 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 4) : ivec4(0);
+    ivec4 v25 = sy + 2 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.b + 5) : ivec4(0);
+
+    ivec4 v30 = sy + 3 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 0) : ivec4(0);
+    ivec4 v31 = sy + 3 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 1) : ivec4(0);
+    ivec4 v32 = sy + 3 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 2) : ivec4(0);
+    ivec4 v33 = sy + 3 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 3) : ivec4(0);
+    ivec4 v34 = sy + 3 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 4) : ivec4(0);
+    ivec4 v35 = sy + 3 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset.a + 5) : ivec4(0);
+
+    ivec4 v40 = sy + 4 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 0) : ivec4(0);
+    ivec4 v41 = sy + 4 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 1) : ivec4(0);
+    ivec4 v42 = sy + 4 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 2) : ivec4(0);
+    ivec4 v43 = sy + 4 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 3) : ivec4(0);
+    ivec4 v44 = sy + 4 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 4) : ivec4(0);
+    ivec4 v45 = sy + 4 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.x + 5) : ivec4(0);
+
+    ivec4 v50 = sy + 5 < psc(h) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 0) : ivec4(0);
+    ivec4 v51 = sy + 5 < psc(h) && sx + 1 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 1) : ivec4(0);
+    ivec4 v52 = sy + 5 < psc(h) && sx + 2 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 2) : ivec4(0);
+    ivec4 v53 = sy + 5 < psc(h) && sx + 3 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 3) : ivec4(0);
+    ivec4 v54 = sy + 5 < psc(h) && sx + 4 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 4) : ivec4(0);
+    ivec4 v55 = sy + 5 < psc(h) && sx + 5 < psc(w) ? i8buffer_ld4(bottom_blob_int8_data, v_offset45.y + 5) : ivec4(0);
+
+    int tile = gy * psc(block_x) + gx; // linear tile index, row-major over the tile grid
+
+    ivec4 m00 = v04 + v00 * 4 - v02 * 5; // first pass: mAB applies coefficient row A along x to input row B; row 0 = (4, 0, -5, 0, 1, 0)
+    ivec4 m01 = v14 + v10 * 4 - v12 * 5;
+    ivec4 m02 = v24 + v20 * 4 - v22 * 5;
+    ivec4 m03 = v34 + v30 * 4 - v32 * 5;
+    ivec4 m04 = v44 + v40 * 4 - v42 * 5;
+    ivec4 m05 = v54 + v50 * 4 - v52 * 5;
+
+    ivec4 m10 = (v04 - v02 * 4) + (v03 - v01 * 4); // row 1 = (0, -4, -4, 1, 1, 0)
+    ivec4 m11 = (v14 - v12 * 4) + (v13 - v11 * 4);
+    ivec4 m12 = (v24 - v22 * 4) + (v23 - v21 * 4);
+    ivec4 m13 = (v34 - v32 * 4) + (v33 - v31 * 4);
+    ivec4 m14 = (v44 - v42 * 4) + (v43 - v41 * 4);
+    ivec4 m15 = (v54 - v52 * 4) + (v53 - v51 * 4);
+
+    ivec4 m20 = (v04 - v02 * 4) - (v03 - v01 * 4); // row 2 = (0, 4, -4, -1, 1, 0)
+    ivec4 m21 = (v14 - v12 * 4) - (v13 - v11 * 4);
+    ivec4 m22 = (v24 - v22 * 4) - (v23 - v21 * 4);
+    ivec4 m23 = (v34 - v32 * 4) - (v33 - v31 * 4);
+    ivec4 m24 = (v44 - v42 * 4) - (v43 - v41 * 4);
+    ivec4 m25 = (v54 - v52 * 4) - (v53 - v51 * 4);
+
+    ivec4 m30 = (v04 - v02) + (v03 - v01) * 2; // row 3 = (0, -2, -1, 2, 1, 0)
+    ivec4 m31 = (v14 - v12) + (v13 - v11) * 2;
+    ivec4 m32 = (v24 - v22) + (v23 - v21) * 2;
+    ivec4 m33 = (v34 - v32) + (v33 - v31) * 2;
+    ivec4 m34 = (v44 - v42) + (v43 - v41) * 2;
+    ivec4 m35 = (v54 - v52) + (v53 - v51) * 2;
+
+    ivec4 m40 = (v04 - v02) - (v03 - v01) * 2; // row 4 = (0, 2, -1, -2, 1, 0)
+    ivec4 m41 = (v14 - v12) - (v13 - v11) * 2;
+    ivec4 m42 = (v24 - v22) - (v23 - v21) * 2;
+    ivec4 m43 = (v34 - v32) - (v33 - v31) * 2;
+    ivec4 m44 = (v44 - v42) - (v43 - v41) * 2;
+    ivec4 m45 = (v54 - v52) - (v53 - v51) * 2;
+
+    ivec4 m50 = v05 + v01 * 4 - v03 * 5; // row 5 = (0, 4, 0, -5, 0, 1)
+    ivec4 m51 = v15 + v11 * 4 - v13 * 5;
+    ivec4 m52 = v25 + v21 * 4 - v23 * 5;
+    ivec4 m53 = v35 + v31 * 4 - v33 * 5;
+    ivec4 m54 = v45 + v41 * 4 - v43 * 5;
+    ivec4 m55 = v55 + v51 * 4 - v53 * 5;
+
+    v00 = m04 + m00 * 4 - m02 * 5; // second pass: vAB applies coefficient row B to mA0..mA5, overwriting the input variables; here B = 0
+    v10 = m14 + m10 * 4 - m12 * 5;
+    v20 = m24 + m20 * 4 - m22 * 5;
+    v30 = m34 + m30 * 4 - m32 * 5;
+    v40 = m44 + m40 * 4 - m42 * 5;
+    v50 = m54 + m50 * 4 - m52 * 5;
+
+    v01 = (m04 - m02 * 4) + (m03 - m01 * 4); // B = 1
+    v11 = (m14 - m12 * 4) + (m13 - m11 * 4);
+    v21 = (m24 - m22 * 4) + (m23 - m21 * 4);
+    v31 = (m34 - m32 * 4) + (m33 - m31 * 4);
+    v41 = (m44 - m42 * 4) + (m43 - m41 * 4);
+    v51 = (m54 - m52 * 4) + (m53 - m51 * 4);
+
+    v02 = (m04 - m02 * 4) - (m03 - m01 * 4); // B = 2
+    v12 = (m14 - m12 * 4) - (m13 - m11 * 4);
+    v22 = (m24 - m22 * 4) - (m23 - m21 * 4);
+    v32 = (m34 - m32 * 4) - (m33 - m31 * 4);
+    v42 = (m44 - m42 * 4) - (m43 - m41 * 4);
+    v52 = (m54 - m52 * 4) - (m53 - m51 * 4);
+
+    v03 = (m04 - m02) + (m03 - m01) * 2; // B = 3
+    v13 = (m14 - m12) + (m13 - m11) * 2;
+    v23 = (m24 - m22) + (m23 - m21) * 2;
+    v33 = (m34 - m32) + (m33 - m31) * 2;
+    v43 = (m44 - m42) + (m43 - m41) * 2;
+    v53 = (m54 - m52) + (m53 - m51) * 2;
+
+    v04 = (m04 - m02) - (m03 - m01) * 2; // B = 4
+    v14 = (m14 - m12) - (m13 - m11) * 2;
+    v24 = (m24 - m22) - (m23 - m21) * 2;
+    v34 = (m34 - m32) - (m33 - m31) * 2;
+    v44 = (m44 - m42) - (m43 - m41) * 2;
+    v54 = (m54 - m52) - (m53 - m51) * 2;
+
+    v05 = m05 + m01 * 4 - m03 * 5; // B = 5
+    v15 = m15 + m11 * 4 - m13 * 5;
+    v25 = m25 + m21 * 4 - m23 * 5;
+    v35 = m35 + m31 * 4 - m33 * 5;
+    v45 = m45 + m41 * 4 - m43 * 5;
+    v55 = m55 + m51 * 4 - m53 * 5;
+
+    // store 36: vAB is split into a signed low byte and a high part and written as ivec2(low, high) to plane 6 * A + B
+    const int outcstepd2 = psc(outcstep) / 2; // NOTE(review): the unit of outcstep is set by host code not visible here - confirm why it is halved
+    int v_tm_offset = gz * outcstepd2 + tile; // position of this tile inside plane 0
+    int v_tm_step = outcstepd2 * psc(c); // distance between planes
+
+    ivec4 v00_low = v00 & ivec4(255); // low 8 bits as 0..255
+    v00_low = mix(v00_low, v00_low - ivec4(256), greaterThanEqual(v00_low, ivec4(128))); // re-read as signed: lanes >= 128 become lane - 256, giving [-128, 127]
+    ivec4 v00_high = (v00 - v00_low) >> 8; // v00 - v00_low is a multiple of 256, so v00 == v00_high * 256 + v00_low exactly
+    bottom_tm_low_high_data[v_tm_offset + 0 * v_tm_step] = ivec2(packInt4x8(v00_low), packInt4x8(v00_high)); // packInt4x8 is defined elsewhere; presumably packs four 8-bit lanes into one int - confirm
+    ivec4 v01_low = v01 & ivec4(255); // the same four-step split is repeated for the remaining 35 values
+    v01_low = mix(v01_low, v01_low - ivec4(256), greaterThanEqual(v01_low, ivec4(128)));
+    ivec4 v01_high = (v01 - v01_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 1 * v_tm_step] = ivec2(packInt4x8(v01_low), packInt4x8(v01_high));
+    ivec4 v02_low = v02 & ivec4(255);
+    v02_low = mix(v02_low, v02_low - ivec4(256), greaterThanEqual(v02_low, ivec4(128)));
+    ivec4 v02_high = (v02 - v02_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 2 * v_tm_step] = ivec2(packInt4x8(v02_low), packInt4x8(v02_high));
+    ivec4 v03_low = v03 & ivec4(255);
+    v03_low = mix(v03_low, v03_low - ivec4(256), greaterThanEqual(v03_low, ivec4(128)));
+    ivec4 v03_high = (v03 - v03_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 3 * v_tm_step] = ivec2(packInt4x8(v03_low), packInt4x8(v03_high));
+    ivec4 v04_low = v04 & ivec4(255);
+    v04_low = mix(v04_low, v04_low - ivec4(256), greaterThanEqual(v04_low, ivec4(128)));
+    ivec4 v04_high = (v04 - v04_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 4 * v_tm_step] = ivec2(packInt4x8(v04_low), packInt4x8(v04_high));
+    ivec4 v05_low = v05 & ivec4(255);
+    v05_low = mix(v05_low, v05_low - ivec4(256), greaterThanEqual(v05_low, ivec4(128)));
+    ivec4 v05_high = (v05 - v05_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 5 * v_tm_step] = ivec2(packInt4x8(v05_low), packInt4x8(v05_high));
+    ivec4 v10_low = v10 & ivec4(255);
+    v10_low = mix(v10_low, v10_low - ivec4(256), greaterThanEqual(v10_low, ivec4(128)));
+    ivec4 v10_high = (v10 - v10_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 6 * v_tm_step] = ivec2(packInt4x8(v10_low), packInt4x8(v10_high));
+    ivec4 v11_low = v11 & ivec4(255);
+    v11_low = mix(v11_low, v11_low - ivec4(256), greaterThanEqual(v11_low, ivec4(128)));
+    ivec4 v11_high = (v11 - v11_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 7 * v_tm_step] = ivec2(packInt4x8(v11_low), packInt4x8(v11_high));
+    ivec4 v12_low = v12 & ivec4(255);
+    v12_low = mix(v12_low, v12_low - ivec4(256), greaterThanEqual(v12_low, ivec4(128)));
+    ivec4 v12_high = (v12 - v12_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 8 * v_tm_step] = ivec2(packInt4x8(v12_low), packInt4x8(v12_high));
+    ivec4 v13_low = v13 & ivec4(255);
+    v13_low = mix(v13_low, v13_low - ivec4(256), greaterThanEqual(v13_low, ivec4(128)));
+    ivec4 v13_high = (v13 - v13_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 9 * v_tm_step] = ivec2(packInt4x8(v13_low), packInt4x8(v13_high));
+    ivec4 v14_low = v14 & ivec4(255);
+    v14_low = mix(v14_low, v14_low - ivec4(256), greaterThanEqual(v14_low, ivec4(128)));
+    ivec4 v14_high = (v14 - v14_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 10 * v_tm_step] = ivec2(packInt4x8(v14_low), packInt4x8(v14_high));
+    ivec4 v15_low = v15 & ivec4(255);
+    v15_low = mix(v15_low, v15_low - ivec4(256), greaterThanEqual(v15_low, ivec4(128)));
+    ivec4 v15_high = (v15 - v15_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 11 * v_tm_step] = ivec2(packInt4x8(v15_low), packInt4x8(v15_high));
+    ivec4 v20_low = v20 & ivec4(255);
+    v20_low = mix(v20_low, v20_low - ivec4(256), greaterThanEqual(v20_low, ivec4(128)));
+    ivec4 v20_high = (v20 - v20_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 12 * v_tm_step] = ivec2(packInt4x8(v20_low), packInt4x8(v20_high));
+    ivec4 v21_low = v21 & ivec4(255);
+    v21_low = mix(v21_low, v21_low - ivec4(256), greaterThanEqual(v21_low, ivec4(128)));
+    ivec4 v21_high = (v21 - v21_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 13 * v_tm_step] = ivec2(packInt4x8(v21_low), packInt4x8(v21_high));
+    ivec4 v22_low = v22 & ivec4(255);
+    v22_low = mix(v22_low, v22_low - ivec4(256), greaterThanEqual(v22_low, ivec4(128)));
+    ivec4 v22_high = (v22 - v22_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 14 * v_tm_step] = ivec2(packInt4x8(v22_low), packInt4x8(v22_high));
+    ivec4 v23_low = v23 & ivec4(255);
+    v23_low = mix(v23_low, v23_low - ivec4(256), greaterThanEqual(v23_low, ivec4(128)));
+    ivec4 v23_high = (v23 - v23_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 15 * v_tm_step] = ivec2(packInt4x8(v23_low), packInt4x8(v23_high));
+    ivec4 v24_low = v24 & ivec4(255);
+    v24_low = mix(v24_low, v24_low - ivec4(256), greaterThanEqual(v24_low, ivec4(128)));
+    ivec4 v24_high = (v24 - v24_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 16 * v_tm_step] = ivec2(packInt4x8(v24_low), packInt4x8(v24_high));
+    ivec4 v25_low = v25 & ivec4(255);
+    v25_low = mix(v25_low, v25_low - ivec4(256), greaterThanEqual(v25_low, ivec4(128)));
+    ivec4 v25_high = (v25 - v25_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 17 * v_tm_step] = ivec2(packInt4x8(v25_low), packInt4x8(v25_high));
+    ivec4 v30_low = v30 & ivec4(255);
+    v30_low = mix(v30_low, v30_low - ivec4(256), greaterThanEqual(v30_low, ivec4(128)));
+    ivec4 v30_high = (v30 - v30_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 18 * v_tm_step] = ivec2(packInt4x8(v30_low), packInt4x8(v30_high));
+    ivec4 v31_low = v31 & ivec4(255);
+    v31_low = mix(v31_low, v31_low - ivec4(256), greaterThanEqual(v31_low, ivec4(128)));
+    ivec4 v31_high = (v31 - v31_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 19 * v_tm_step] = ivec2(packInt4x8(v31_low), packInt4x8(v31_high));
+    ivec4 v32_low = v32 & ivec4(255);
+    v32_low = mix(v32_low, v32_low - ivec4(256), greaterThanEqual(v32_low, ivec4(128)));
+    ivec4 v32_high = (v32 - v32_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 20 * v_tm_step] = ivec2(packInt4x8(v32_low), packInt4x8(v32_high));
+    ivec4 v33_low = v33 & ivec4(255);
+    v33_low = mix(v33_low, v33_low - ivec4(256), greaterThanEqual(v33_low, ivec4(128)));
+    ivec4 v33_high = (v33 - v33_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 21 * v_tm_step] = ivec2(packInt4x8(v33_low), packInt4x8(v33_high));
+    ivec4 v34_low = v34 & ivec4(255);
+    v34_low = mix(v34_low, v34_low - ivec4(256), greaterThanEqual(v34_low, ivec4(128)));
+    ivec4 v34_high = (v34 - v34_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 22 * v_tm_step] = ivec2(packInt4x8(v34_low), packInt4x8(v34_high));
+    ivec4 v35_low = v35 & ivec4(255);
+    v35_low = mix(v35_low, v35_low - ivec4(256), greaterThanEqual(v35_low, ivec4(128)));
+    ivec4 v35_high = (v35 - v35_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 23 * v_tm_step] = ivec2(packInt4x8(v35_low), packInt4x8(v35_high));
+    ivec4 v40_low = v40 & ivec4(255);
+    v40_low = mix(v40_low, v40_low - ivec4(256), greaterThanEqual(v40_low, ivec4(128)));
+    ivec4 v40_high = (v40 - v40_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 24 * v_tm_step] = ivec2(packInt4x8(v40_low), packInt4x8(v40_high));
+    ivec4 v41_low = v41 & ivec4(255);
+    v41_low = mix(v41_low, v41_low - ivec4(256), greaterThanEqual(v41_low, ivec4(128)));
+    ivec4 v41_high = (v41 - v41_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 25 * v_tm_step] = ivec2(packInt4x8(v41_low), packInt4x8(v41_high));
+    ivec4 v42_low = v42 & ivec4(255);
+    v42_low = mix(v42_low, v42_low - ivec4(256), greaterThanEqual(v42_low, ivec4(128)));
+    ivec4 v42_high = (v42 - v42_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 26 * v_tm_step] = ivec2(packInt4x8(v42_low), packInt4x8(v42_high));
+    ivec4 v43_low = v43 & ivec4(255);
+    v43_low = mix(v43_low, v43_low - ivec4(256), greaterThanEqual(v43_low, ivec4(128)));
+    ivec4 v43_high = (v43 - v43_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 27 * v_tm_step] = ivec2(packInt4x8(v43_low), packInt4x8(v43_high));
+    ivec4 v44_low = v44 & ivec4(255);
+    v44_low = mix(v44_low, v44_low - ivec4(256), greaterThanEqual(v44_low, ivec4(128)));
+    ivec4 v44_high = (v44 - v44_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 28 * v_tm_step] = ivec2(packInt4x8(v44_low), packInt4x8(v44_high));
+    ivec4 v45_low = v45 & ivec4(255);
+    v45_low = mix(v45_low, v45_low - ivec4(256), greaterThanEqual(v45_low, ivec4(128)));
+    ivec4 v45_high = (v45 - v45_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 29 * v_tm_step] = ivec2(packInt4x8(v45_low), packInt4x8(v45_high));
+    ivec4 v50_low = v50 & ivec4(255);
+    v50_low = mix(v50_low, v50_low - ivec4(256), greaterThanEqual(v50_low, ivec4(128)));
+    ivec4 v50_high = (v50 - v50_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 30 * v_tm_step] = ivec2(packInt4x8(v50_low), packInt4x8(v50_high));
+    ivec4 v51_low = v51 & ivec4(255);
+    v51_low = mix(v51_low, v51_low - ivec4(256), greaterThanEqual(v51_low, ivec4(128)));
+    ivec4 v51_high = (v51 - v51_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 31 * v_tm_step] = ivec2(packInt4x8(v51_low), packInt4x8(v51_high));
+    ivec4 v52_low = v52 & ivec4(255);
+    v52_low = mix(v52_low, v52_low - ivec4(256), greaterThanEqual(v52_low, ivec4(128)));
+    ivec4 v52_high = (v52 - v52_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 32 * v_tm_step] = ivec2(packInt4x8(v52_low), packInt4x8(v52_high));
+    ivec4 v53_low = v53 & ivec4(255);
+    v53_low = mix(v53_low, v53_low - ivec4(256), greaterThanEqual(v53_low, ivec4(128)));
+    ivec4 v53_high = (v53 - v53_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 33 * v_tm_step] = ivec2(packInt4x8(v53_low), packInt4x8(v53_high));
+    ivec4 v54_low = v54 & ivec4(255);
+    v54_low = mix(v54_low, v54_low - ivec4(256), greaterThanEqual(v54_low, ivec4(128)));
+    ivec4 v54_high = (v54 - v54_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 34 * v_tm_step] = ivec2(packInt4x8(v54_low), packInt4x8(v54_high));
+    ivec4 v55_low = v55 & ivec4(255);
+    v55_low = mix(v55_low, v55_low - ivec4(256), greaterThanEqual(v55_low, ivec4(128)));
+    ivec4 v55_high = (v55 - v55_low) >> 8;
+    bottom_tm_low_high_data[v_tm_offset + 35 * v_tm_step] = ivec2(packInt4x8(v55_low), packInt4x8(v55_high));
+}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp
new file mode 100644
index 000000000000..056ca559e3a9
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output_int8.comp
@@ -0,0 +1,299 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // 1 = main() adds bias_data[gz] to every output value
+layout(constant_id = 1) const int activation_type = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 2) const float activation_param_0 = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 3) const float activation_param_1 = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 4) const int use_int8_requantize = 0; // 1 = scale by top_scales_data[0] and store int8 (binding 5); otherwise store floating point (binding 1)
+layout(binding = 0) readonly buffer top_tm_blob { ivec4 top_tm_blob_data[]; }; // transformed-domain integer sums; main() reads 36 of them per tile
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // floating-point output, written only when use_int8_requantize != 1
+layout(binding = 2) readonly buffer bias_blob { vec4 bias_data[]; }; // one vec4 per group of 4 output channels, indexed by gz
+layout(binding = 3) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; // dequantization factors, one vec4 per group of 4 output channels, indexed by gz
+layout(binding = 4) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // requantization scale; only element 0 is read
+layout(binding = 5) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; // int8 output, written only when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter
+{
+    int cstep; // stride in top_tm_blob between consecutive gz groups within one coefficient plane
+
+    int block_x; // tile-grid width: bound for gx and row stride of the tile index
+    int block_y; // tile-grid height: bound for gy
+
+    int outw; // output width: row stride of the output and right-edge clip
+    int outh; // output height: bottom-edge clip
+    int outcstep; // stride in the output between consecutive gz groups
+    int outc; // output channel count; main() dispatches over (outc + 3) / 4 groups
+} p;
+
+void main() // winograd43 int8 output transform: each invocation turns one 6x6 tile of integer sums into a 4x4 output tile for 4 packed channels
+{
+    const int gx = int(gl_GlobalInvocationID.x); // tile column
+    const int gy = int(gl_GlobalInvocationID.y); // tile row
+    const int gz = int(gl_GlobalInvocationID.z); // group of 4 output channels
+
+    const int outc4 = (p.outc + 3) / 4; // number of channel groups, rounded up
+
+    if (gx >= p.block_x || gy >= p.block_y || gz >= outc4)
+        return;
+
+    // load the 36 transformed-domain sums of this tile; coefficient k is stored k * v_tm_step elements after coefficient 0
+    int v_tm_offset = gz * p.cstep + gy * p.block_x + gx;
+    int v_tm_step = p.cstep * outc4;
+
+    ivec4 v00 = top_tm_blob_data[v_tm_offset + 0 * v_tm_step];
+    ivec4 v01 = top_tm_blob_data[v_tm_offset + 1 * v_tm_step];
+    ivec4 v02 = top_tm_blob_data[v_tm_offset + 2 * v_tm_step];
+    ivec4 v03 = top_tm_blob_data[v_tm_offset + 3 * v_tm_step];
+    ivec4 v04 = top_tm_blob_data[v_tm_offset + 4 * v_tm_step];
+    ivec4 v05 = top_tm_blob_data[v_tm_offset + 5 * v_tm_step];
+    ivec4 v10 = top_tm_blob_data[v_tm_offset + 6 * v_tm_step];
+    ivec4 v11 = top_tm_blob_data[v_tm_offset + 7 * v_tm_step];
+    ivec4 v12 = top_tm_blob_data[v_tm_offset + 8 * v_tm_step];
+    ivec4 v13 = top_tm_blob_data[v_tm_offset + 9 * v_tm_step];
+    ivec4 v14 = top_tm_blob_data[v_tm_offset + 10 * v_tm_step];
+    ivec4 v15 = top_tm_blob_data[v_tm_offset + 11 * v_tm_step];
+    ivec4 v20 = top_tm_blob_data[v_tm_offset + 12 * v_tm_step];
+    ivec4 v21 = top_tm_blob_data[v_tm_offset + 13 * v_tm_step];
+    ivec4 v22 = top_tm_blob_data[v_tm_offset + 14 * v_tm_step];
+    ivec4 v23 = top_tm_blob_data[v_tm_offset + 15 * v_tm_step];
+    ivec4 v24 = top_tm_blob_data[v_tm_offset + 16 * v_tm_step];
+    ivec4 v25 = top_tm_blob_data[v_tm_offset + 17 * v_tm_step];
+    ivec4 v30 = top_tm_blob_data[v_tm_offset + 18 * v_tm_step];
+    ivec4 v31 = top_tm_blob_data[v_tm_offset + 19 * v_tm_step];
+    ivec4 v32 = top_tm_blob_data[v_tm_offset + 20 * v_tm_step];
+    ivec4 v33 = top_tm_blob_data[v_tm_offset + 21 * v_tm_step];
+    ivec4 v34 = top_tm_blob_data[v_tm_offset + 22 * v_tm_step];
+    ivec4 v35 = top_tm_blob_data[v_tm_offset + 23 * v_tm_step];
+    ivec4 v40 = top_tm_blob_data[v_tm_offset + 24 * v_tm_step];
+    ivec4 v41 = top_tm_blob_data[v_tm_offset + 25 * v_tm_step];
+    ivec4 v42 = top_tm_blob_data[v_tm_offset + 26 * v_tm_step];
+    ivec4 v43 = top_tm_blob_data[v_tm_offset + 27 * v_tm_step];
+    ivec4 v44 = top_tm_blob_data[v_tm_offset + 28 * v_tm_step];
+    ivec4 v45 = top_tm_blob_data[v_tm_offset + 29 * v_tm_step];
+    ivec4 v50 = top_tm_blob_data[v_tm_offset + 30 * v_tm_step];
+    ivec4 v51 = top_tm_blob_data[v_tm_offset + 31 * v_tm_step];
+    ivec4 v52 = top_tm_blob_data[v_tm_offset + 32 * v_tm_step];
+    ivec4 v53 = top_tm_blob_data[v_tm_offset + 33 * v_tm_step];
+    ivec4 v54 = top_tm_blob_data[v_tm_offset + 34 * v_tm_step];
+    ivec4 v55 = top_tm_blob_data[v_tm_offset + 35 * v_tm_step];
+
+    // first pass with implicit transpose: mAJ combines vJ0..vJ5 using output-transform row A; every J == 5 result carries an extra factor of 4
+    ivec4 m00 = v00 + v01 + v02 + v03 + v04; // row 0 weights: 1 1 1 1 1 0
+    ivec4 m01 = v10 + v11 + v12 + v13 + v14;
+    ivec4 m02 = v20 + v21 + v22 + v23 + v24;
+    ivec4 m03 = v30 + v31 + v32 + v33 + v34;
+    ivec4 m04 = v40 + v41 + v42 + v43 + v44;
+    ivec4 m05 = (v50 + v51 + v52 + v53 + v54) * 4;
+
+    ivec4 m10 = (v01 - v02) + (v03 - v04) * 2; // row 1 weights: 0 1 -1 2 -2 0
+    ivec4 m11 = (v11 - v12) + (v13 - v14) * 2;
+    ivec4 m12 = (v21 - v22) + (v23 - v24) * 2;
+    ivec4 m13 = (v31 - v32) + (v33 - v34) * 2;
+    ivec4 m14 = (v41 - v42) + (v43 - v44) * 2;
+    ivec4 m15 = ((v51 - v52) + (v53 - v54) * 2) * 4;
+
+    ivec4 m20 = (v01 + v02) + (v03 + v04) * 4; // row 2 weights: 0 1 1 4 4 0
+    ivec4 m21 = (v11 + v12) + (v13 + v14) * 4;
+    ivec4 m22 = (v21 + v22) + (v23 + v24) * 4;
+    ivec4 m23 = (v31 + v32) + (v33 + v34) * 4;
+    ivec4 m24 = (v41 + v42) + (v43 + v44) * 4;
+    ivec4 m25 = ((v51 + v52) + (v53 + v54) * 4) * 4;
+
+    ivec4 m30 = (v01 - v02) + (v03 - v04) * 8 + v05 * 4; // row 3 weights: 0 1 -1 8 -8 4 (sixth coefficient pre-scaled by 4)
+    ivec4 m31 = (v11 - v12) + (v13 - v14) * 8 + v15 * 4;
+    ivec4 m32 = (v21 - v22) + (v23 - v24) * 8 + v25 * 4;
+    ivec4 m33 = (v31 - v32) + (v33 - v34) * 8 + v35 * 4;
+    ivec4 m34 = (v41 - v42) + (v43 - v44) * 8 + v45 * 4;
+    ivec4 m35 = ((v51 - v52) + (v53 - v54) * 8 + v55 * 4) * 4;
+
+    v00 = m00 + m01 + m02 + m03 + m04; // second pass: vAB combines mA0..mA5 using row B; the v registers are reused for the 4x4 result
+    v10 = m10 + m11 + m12 + m13 + m14;
+    v20 = m20 + m21 + m22 + m23 + m24;
+    v30 = m30 + m31 + m32 + m33 + m34;
+
+    v01 = (m01 - m02) + (m03 - m04) * 2;
+    v11 = (m11 - m12) + (m13 - m14) * 2;
+    v21 = (m21 - m22) + (m23 - m24) * 2;
+    v31 = (m31 - m32) + (m33 - m34) * 2;
+
+    v02 = (m01 + m02) + (m03 + m04) * 4;
+    v12 = (m11 + m12) + (m13 + m14) * 4;
+    v22 = (m21 + m22) + (m23 + m24) * 4;
+    v32 = (m31 + m32) + (m33 + m34) * 4;
+
+    v03 = (m01 - m02) + (m03 - m04) * 8 + m05; // mA5 is added unscaled here because the first pass already multiplied it by 4
+    v13 = (m11 - m12) + (m13 - m14) * 8 + m15;
+    v23 = (m21 - m22) + (m23 - m24) * 8 + m25;
+    v33 = (m31 - m32) + (m33 - m34) * 8 + m35;
+
+    const vec4 descale = vec4(1.f / 576.f) * weight_descales_data[gz]; // per-channel dequantization; NOTE(review): 576 presumably cancels the integer scaling of the int8 Winograd transforms - confirm against the input/kernel transform code
+
+    vec4 out00 = vec4(v00) * descale; // outRC is the value for output row y + R, column x + C (see the stores below)
+    vec4 out01 = vec4(v01) * descale;
+    vec4 out02 = vec4(v02) * descale;
+    vec4 out03 = vec4(v03) * descale;
+    vec4 out10 = vec4(v10) * descale;
+    vec4 out11 = vec4(v11) * descale;
+    vec4 out12 = vec4(v12) * descale;
+    vec4 out13 = vec4(v13) * descale;
+    vec4 out20 = vec4(v20) * descale;
+    vec4 out21 = vec4(v21) * descale;
+    vec4 out22 = vec4(v22) * descale;
+    vec4 out23 = vec4(v23) * descale;
+    vec4 out30 = vec4(v30) * descale;
+    vec4 out31 = vec4(v31) * descale;
+    vec4 out32 = vec4(v32) * descale;
+    vec4 out33 = vec4(v33) * descale;
+
+    if (bias_term == 1)
+    {
+        const vec4 bias_value = bias_data[gz]; // same per-channel bias for all 16 positions of the tile
+
+        out00 += bias_value;
+        out01 += bias_value;
+        out02 += bias_value;
+        out03 += bias_value;
+        out10 += bias_value;
+        out11 += bias_value;
+        out12 += bias_value;
+        out13 += bias_value;
+        out20 += bias_value;
+        out21 += bias_value;
+        out22 += bias_value;
+        out23 += bias_value;
+        out30 += bias_value;
+        out31 += bias_value;
+        out32 += bias_value;
+        out33 += bias_value;
+    }
+
+    out00 = vec4(activation_afpvec4(afpvec4(out00), activation_type, activation_param_0, activation_param_1)); // activation is evaluated on the afpvec4 conversion of the value, then converted back to vec4
+    out01 = vec4(activation_afpvec4(afpvec4(out01), activation_type, activation_param_0, activation_param_1));
+    out02 = vec4(activation_afpvec4(afpvec4(out02), activation_type, activation_param_0, activation_param_1));
+    out03 = vec4(activation_afpvec4(afpvec4(out03), activation_type, activation_param_0, activation_param_1));
+    out10 = vec4(activation_afpvec4(afpvec4(out10), activation_type, activation_param_0, activation_param_1));
+    out11 = vec4(activation_afpvec4(afpvec4(out11), activation_type, activation_param_0, activation_param_1));
+    out12 = vec4(activation_afpvec4(afpvec4(out12), activation_type, activation_param_0, activation_param_1));
+    out13 = vec4(activation_afpvec4(afpvec4(out13), activation_type, activation_param_0, activation_param_1));
+    out20 = vec4(activation_afpvec4(afpvec4(out20), activation_type, activation_param_0, activation_param_1));
+    out21 = vec4(activation_afpvec4(afpvec4(out21), activation_type, activation_param_0, activation_param_1));
+    out22 = vec4(activation_afpvec4(afpvec4(out22), activation_type, activation_param_0, activation_param_1));
+    out23 = vec4(activation_afpvec4(afpvec4(out23), activation_type, activation_param_0, activation_param_1));
+    out30 = vec4(activation_afpvec4(afpvec4(out30), activation_type, activation_param_0, activation_param_1));
+    out31 = vec4(activation_afpvec4(afpvec4(out31), activation_type, activation_param_0, activation_param_1));
+    out32 = vec4(activation_afpvec4(afpvec4(out32), activation_type, activation_param_0, activation_param_1));
+    out33 = vec4(activation_afpvec4(afpvec4(out33), activation_type, activation_param_0, activation_param_1));
+
+    // store the 4x4 tile whose top-left output position is (x, y)
+    int x = gx * 4;
+    int y = gy * 4;
+
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = buffer_ld1(top_scales_data, 0); // a single requantization scale shared by all channels
+
+        out00 *= top_scale;
+        out01 *= top_scale;
+        out02 *= top_scale;
+        out03 *= top_scale;
+        out10 *= top_scale;
+        out11 *= top_scale;
+        out12 *= top_scale;
+        out13 *= top_scale;
+        out20 *= top_scale;
+        out21 *= top_scale;
+        out22 *= top_scale;
+        out23 *= top_scale;
+        out30 *= top_scale;
+        out31 *= top_scale;
+        out32 *= top_scale;
+        out33 *= top_scale;
+
+        ivec4 out00_int8 = float2int8vec4(out00); // float2int8vec4 is defined outside this file; NOTE(review): presumably rounds and saturates to the int8 range - confirm
+        ivec4 out01_int8 = float2int8vec4(out01);
+        ivec4 out02_int8 = float2int8vec4(out02);
+        ivec4 out03_int8 = float2int8vec4(out03);
+        ivec4 out10_int8 = float2int8vec4(out10);
+        ivec4 out11_int8 = float2int8vec4(out11);
+        ivec4 out12_int8 = float2int8vec4(out12);
+        ivec4 out13_int8 = float2int8vec4(out13);
+        ivec4 out20_int8 = float2int8vec4(out20);
+        ivec4 out21_int8 = float2int8vec4(out21);
+        ivec4 out22_int8 = float2int8vec4(out22);
+        ivec4 out23_int8 = float2int8vec4(out23);
+        ivec4 out30_int8 = float2int8vec4(out30);
+        ivec4 out31_int8 = float2int8vec4(out31);
+        ivec4 out32_int8 = float2int8vec4(out32);
+        ivec4 out33_int8 = float2int8vec4(out33);
+
+        ivec4 v_offset = gz * p.outcstep + y * p.outw + x + ivec4(0, 1, 2, 3) * p.outw; // .r/.g/.b/.a = element offsets of output rows y .. y + 3 at column x
+
+        i8buffer_st4(top_blob_int8_data, v_offset.r + 0, out00_int8); // position (x, y) is stored without an edge test
+        if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 1, out01_int8); // columns and rows beyond outw / outh are skipped
+        if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 2, out02_int8);
+        if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.r + 3, out03_int8);
+
+        if (y + 1 < p.outh)
+        {
+            i8buffer_st4(top_blob_int8_data, v_offset.g + 0, out10_int8);
+            if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 1, out11_int8);
+            if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 2, out12_int8);
+            if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.g + 3, out13_int8);
+        }
+
+        if (y + 2 < p.outh)
+        {
+            i8buffer_st4(top_blob_int8_data, v_offset.b + 0, out20_int8);
+            if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 1, out21_int8);
+            if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 2, out22_int8);
+            if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.b + 3, out23_int8);
+        }
+
+        if (y + 3 < p.outh)
+        {
+            i8buffer_st4(top_blob_int8_data, v_offset.a + 0, out30_int8);
+            if (x + 1 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 1, out31_int8);
+            if (x + 2 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 2, out32_int8);
+            if (x + 3 < p.outw) i8buffer_st4(top_blob_int8_data, v_offset.a + 3, out33_int8);
+        }
+    }
+    else
+    {
+        ivec4 v_offset = gz * p.outcstep + y * p.outw + x + ivec4(0, 1, 2, 3) * p.outw; // same addressing as the int8 path, applied to the floating-point output buffer
+
+        buffer_st4(top_blob_data, v_offset.r + 0, afpvec4(out00)); // position (x, y) is stored without an edge test
+        if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.r + 1, afpvec4(out01)); // columns and rows beyond outw / outh are skipped
+        if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.r + 2, afpvec4(out02));
+        if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.r + 3, afpvec4(out03));
+
+        if (y + 1 < p.outh)
+        {
+            buffer_st4(top_blob_data, v_offset.g + 0, afpvec4(out10));
+            if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.g + 1, afpvec4(out11));
+            if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.g + 2, afpvec4(out12));
+            if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.g + 3, afpvec4(out13));
+        }
+
+        if (y + 2 < p.outh)
+        {
+            buffer_st4(top_blob_data, v_offset.b + 0, afpvec4(out20));
+            if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.b + 1, afpvec4(out21));
+            if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.b + 2, afpvec4(out22));
+            if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.b + 3, afpvec4(out23));
+        }
+
+        if (y + 3 < p.outh)
+        {
+            buffer_st4(top_blob_data, v_offset.a + 0, afpvec4(out30));
+            if (x + 1 < p.outw) buffer_st4(top_blob_data, v_offset.a + 1, afpvec4(out31));
+            if (x + 2 < p.outw) buffer_st4(top_blob_data, v_offset.a + 2, afpvec4(out32));
+            if (x + 3 < p.outw) buffer_st4(top_blob_data, v_offset.a + 3, afpvec4(out33));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp b/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp
new file mode 100644
index 000000000000..e3e49ed2dcb2
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_packed_1x1s1d1_int8.comp
@@ -0,0 +1,496 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+#define LOCAL_MEMORY_UNROLL_INCH 8 // number of input-channel steps staged in shared memory per barrier-delimited pass of main()
+
+layout(constant_id = 0) const int bias_term = 0; // 1 = main() adds bias_data[gy] to the dequantized sums
+layout(constant_id = 1) const int activation_type = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 2) const float activation_param_0 = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 3) const float activation_param_1 = 0; // forwarded unchanged to activation_afpvec4()
+layout(constant_id = 4) const int use_int8_requantize = 0; // 1 = scale by top_scales_data[0] and convert the result to int8
+layout(constant_id = 5) const int elempack = 1; // 4 selects the packed input read; any other value takes the "elempack == 1" gather path
+layout(constant_id = 6) const int out_elempack = 1; // compared against 4 when choosing the output store layout
+
+#define shape_constant_id_offset 7 // first constant_id used by the shape constants below
+layout(constant_id = shape_constant_id_offset + 0) const int c = 0; // trip count of the accumulation loop over input-channel steps
+layout(constant_id = shape_constant_id_offset + 1) const int cstep = 0; // input stride between channel planes
+layout(constant_id = shape_constant_id_offset + 2) const int outc = 0; // bound for gy
+layout(constant_id = shape_constant_id_offset + 3) const int outcstep = 0; // output stride between gy groups
+layout(constant_id = shape_constant_id_offset + 4) const int outcstep_native = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int size = 0; // bound for gx
+layout(constant_id = shape_constant_id_offset + 6) const int num_output = 0; // copied into nout by main()
+layout(constant_id = shape_constant_id_offset + 7) const int num_input = 0; // input channel count; channels at or beyond it are read as zero in the elempack 1 path
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input activations
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // floating-point output buffer
+layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; // each ivec4 holds four packed 4x8-bit weight words (k0p..k3p in main())
+layout(binding = 3) readonly buffer bias_blob { vec4 bias_data[]; }; // indexed by gy
+layout(binding = 4) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; // dequantization factors, indexed by gy
+layout(binding = 5) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // requantization scale; only element 0 is read
+layout(binding = 6) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; // int8 output used when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter // NOTE(review): fields mirror the shape constants above; psc() is defined outside this file and presumably picks between the two - confirm
+{
+    int c;
+    int cstep;
+    int outc;
+    int outcstep;
+    int outcstep_native;
+    int size;
+} p;
+
+#if NCNN_shader_local_memory
+shared ivec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH]; // staged activations, written as tmp_v[lx][ly]; NOTE(review): first dimension 8 implies a local size of at most 8 in x - confirm
+shared ivec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; // staged weights, written as tmp_k[ly][lx]; NOTE(review): first dimension 8 implies a local size of at most 8 in y - confirm
+#endif
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+
+#if !NCNN_shader_local_memory
+    if (gx >= psc(size) || gy >= psc(outc))
+        return;
+#endif
+
+    const int base_pos = gx * 4;
+
+    ivec4 sum0 = ivec4(0);
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+    int w_offset = gy * psc(c) * 4;
+
+#if NCNN_shader_local_memory
+    const int lx = int(gl_LocalInvocationID.x);
+    const int ly = int(gl_LocalInvocationID.y);
+
+    int z = 0;
+    for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        if (ly < LOCAL_MEMORY_UNROLL_INCH)
+        {
+            const int zz = z + ly;
+
+            if (elempack == 4)
+            {
+                int v0p = 0;
+                int v1p = 0;
+                int v2p = 0;
+                int v3p = 0;
+
+                if (gx < psc(size))
+                {
+                    const int v_offset = base_pos + zz * psc(cstep);
+                    v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0);
+                    v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1);
+                    v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2);
+                    v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3);
+                }
+
+                tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p);
+            }
+            else // elempack == 1
+            {
+                const int ci = zz * 4;
+
+                int r0p = 0;
+                int r1p = 0;
+                int r2p = 0;
+                int r3p = 0;
+
+                if (gx < psc(size))
+                {
+                    r0p = i8buffer_sm4(bottom_blob_int8_data, gx + ci * psc(cstep));
+                    if (ci + 1 < num_input) r1p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 1) * psc(cstep));
+                    if (ci + 2 < num_input) r2p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 2) * psc(cstep));
+                    if (ci + 3 < num_input) r3p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 3) * psc(cstep));
+                }
+
+                const ivec4 r0 = unpackInt4x8(r0p);
+                const ivec4 r1 = unpackInt4x8(r1p);
+                const ivec4 r2 = unpackInt4x8(r2p);
+                const ivec4 r3 = unpackInt4x8(r3p);
+
+                tmp_v[lx][ly] = ivec4(packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r)),
+                                      packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g)),
+                                      packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b)),
+                                      packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a)));
+            }
+        }
+
+        if (lx < LOCAL_MEMORY_UNROLL_INCH)
+        {
+            tmp_k[ly][lx] = weight_data[w_offset / 4 + lx];
+        }
+
+        barrier();
+
+        for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
+        {
+            const ivec4 vp = tmp_v[lx][z4];
+            const int v0p = vp.r;
+            const int v1p = vp.g;
+            const int v2p = vp.b;
+            const int v3p = vp.a;
+
+            const ivec4 kp = tmp_k[ly][z4];
+            const int k0p = kp.r;
+            const int k1p = kp.g;
+            const int k2p = kp.b;
+            const int k3p = kp.a;
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+            sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p));
+            sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p));
+            sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p));
+            sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p));
+#else
+            const ivec4 v0 = unpackInt4x8(v0p);
+            const ivec4 v1 = unpackInt4x8(v1p);
+            const ivec4 v2 = unpackInt4x8(v2p);
+            const ivec4 v3 = unpackInt4x8(v3p);
+
+            const ivec4 k0 = unpackInt4x8(k0p);
+            const ivec4 k1 = unpackInt4x8(k1p);
+            const ivec4 k2 = unpackInt4x8(k2p);
+            const ivec4 k3 = unpackInt4x8(k3p);
+
+            sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a,
+                          v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a,
+                          v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a,
+                          v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a);
+            sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a,
+                          v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a,
+                          v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a,
+                          v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a);
+            sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a,
+                          v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a,
+                          v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a,
+                          v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a);
+            sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a,
+                          v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a,
+                          v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a,
+                          v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a);
+#endif
+        }
+
+        w_offset += LOCAL_MEMORY_UNROLL_INCH * 4;
+
+        barrier();
+    }
+
+    if (z < psc(c))
+    {
+        const int remain = psc(c) - z;
+
+        if (ly < remain)
+        {
+            const int zz = z + ly;
+
+            if (elempack == 4)
+            {
+                int v0p = 0;
+                int v1p = 0;
+                int v2p = 0;
+                int v3p = 0;
+
+                if (gx < psc(size))
+                {
+                    const int v_offset = base_pos + zz * psc(cstep);
+                    v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0);
+                    v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1);
+                    v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2);
+                    v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3);
+                }
+
+                tmp_v[lx][ly] = ivec4(v0p, v1p, v2p, v3p);
+            }
+            else // elempack == 1
+            {
+                const int ci = zz * 4;
+
+                int r0p = 0;
+                int r1p = 0;
+                int r2p = 0;
+                int r3p = 0;
+
+                if (gx < psc(size))
+                {
+                    r0p = i8buffer_sm4(bottom_blob_int8_data, gx + ci * psc(cstep));
+                    if (ci + 1 < num_input) r1p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 1) * psc(cstep));
+                    if (ci + 2 < num_input) r2p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 2) * psc(cstep));
+                    if (ci + 3 < num_input) r3p = i8buffer_sm4(bottom_blob_int8_data, gx + (ci + 3) * psc(cstep));
+                }
+
+                const ivec4 r0 = unpackInt4x8(r0p);
+                const ivec4 r1 = unpackInt4x8(r1p);
+                const ivec4 r2 = unpackInt4x8(r2p);
+                const ivec4 r3 = unpackInt4x8(r3p);
+
+                tmp_v[lx][ly] = ivec4(packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r)),
+                                      packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g)),
+                                      packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b)),
+                                      packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a)));
+            }
+        }
+
+        if (lx < remain)
+        {
+            tmp_k[ly][lx] = weight_data[w_offset / 4 + lx];
+        }
+
+        barrier();
+
+        for (int z4 = 0; z4 < remain; z4++)
+        {
+            const ivec4 vp = tmp_v[lx][z4];
+            const int v0p = vp.r;
+            const int v1p = vp.g;
+            const int v2p = vp.b;
+            const int v3p = vp.a;
+
+            const ivec4 kp = tmp_k[ly][z4];
+            const int k0p = kp.r;
+            const int k1p = kp.g;
+            const int k2p = kp.b;
+            const int k3p = kp.a;
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+            sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p));
+            sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p));
+            sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p));
+            sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p));
+#else
+            const ivec4 v0 = unpackInt4x8(v0p);
+            const ivec4 v1 = unpackInt4x8(v1p);
+            const ivec4 v2 = unpackInt4x8(v2p);
+            const ivec4 v3 = unpackInt4x8(v3p);
+
+            const ivec4 k0 = unpackInt4x8(k0p);
+            const ivec4 k1 = unpackInt4x8(k1p);
+            const ivec4 k2 = unpackInt4x8(k2p);
+            const ivec4 k3 = unpackInt4x8(k3p);
+
+            sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a,
+                          v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a,
+                          v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a,
+                          v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a);
+            sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a,
+                          v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a,
+                          v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a,
+                          v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a);
+            sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a,
+                          v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a,
+                          v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a,
+                          v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a);
+            sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a,
+                          v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a,
+                          v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a,
+                          v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a);
+#endif
+        }
+    }
+#else
+    for (int z = 0; z < psc(c); z++)
+    {
+        const ivec4 kp = weight_data[w_offset / 4];
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        int v0p;
+        int v1p;
+        int v2p;
+        int v3p;
+
+        if (elempack == 4)
+        {
+            const int v_offset = base_pos + z * psc(cstep);
+            v0p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 0);
+            v1p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 1);
+            v2p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 2);
+            v3p = i8buffer_sm4(bottom_blob_int8_data, v_offset + 3);
+        }
+        else // elempack == 1
+        {
+            const int v_offset = gx + z * 4 * psc(cstep);
+            const int ci = z * 4;
+            const ivec4 r0 = i8buffer_ld4(bottom_blob_int8_data, v_offset);
+            const ivec4 r1 = ci + 1 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep)) : ivec4(0);
+            const ivec4 r2 = ci + 2 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 2) : ivec4(0);
+            const ivec4 r3 = ci + 3 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 3) : ivec4(0);
+
+            v0p = packInt4x8(ivec4(r0.r, r1.r, r2.r, r3.r));
+            v1p = packInt4x8(ivec4(r0.g, r1.g, r2.g, r3.g));
+            v2p = packInt4x8(ivec4(r0.b, r1.b, r2.b, r3.b));
+            v3p = packInt4x8(ivec4(r0.a, r1.a, r2.a, r3.a));
+        }
+
+        const int k0p = kp.r;
+        const int k1p = kp.g;
+        const int k2p = kp.b;
+        const int k3p = kp.a;
+
+        sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p));
+        sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p));
+        sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p));
+        sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p));
+#else
+        ivec4 v0;
+        ivec4 v1;
+        ivec4 v2;
+        ivec4 v3;
+
+        if (elempack == 4)
+        {
+            const int v_offset = base_pos + z * psc(cstep);
+            v0 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 0);
+            v1 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 1);
+            v2 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 2);
+            v3 = i8buffer_ld4(bottom_blob_int8_data, v_offset + 3);
+        }
+        else // elempack == 1
+        {
+            const int v_offset = gx + z * 4 * psc(cstep);
+            const int ci = z * 4;
+            const ivec4 r0 = i8buffer_ld4(bottom_blob_int8_data, v_offset);
+            const ivec4 r1 = ci + 1 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep)) : ivec4(0);
+            const ivec4 r2 = ci + 2 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 2) : ivec4(0);
+            const ivec4 r3 = ci + 3 < num_input ? i8buffer_ld4(bottom_blob_int8_data, v_offset + psc(cstep) * 3) : ivec4(0);
+
+            v0 = ivec4(r0.r, r1.r, r2.r, r3.r);
+            v1 = ivec4(r0.g, r1.g, r2.g, r3.g);
+            v2 = ivec4(r0.b, r1.b, r2.b, r3.b);
+            v3 = ivec4(r0.a, r1.a, r2.a, r3.a);
+        }
+
+        const ivec4 k0 = unpackInt4x8(kp.r);
+        const ivec4 k1 = unpackInt4x8(kp.g);
+        const ivec4 k2 = unpackInt4x8(kp.b);
+        const ivec4 k3 = unpackInt4x8(kp.a);
+
+        sum0 += ivec4(v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a,
+                      v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a,
+                      v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a,
+                      v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a);
+        sum1 += ivec4(v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a,
+                      v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a,
+                      v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a,
+                      v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a);
+        sum2 += ivec4(v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a,
+                      v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a,
+                      v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a,
+                      v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a);
+        sum3 += ivec4(v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a,
+                      v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a,
+                      v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a,
+                      v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a);
+#endif
+
+        w_offset += 4;
+    }
+#endif
+
+#if NCNN_shader_local_memory
+    if (gx >= psc(size) || gy >= psc(outc))
+        return;
+#endif
+
+    const int base_ch = gy * 4;
+    const int nout = num_output;
+    const vec4 descale = weight_descales_data[gy];
+
+    vec4 sumfp32_0 = vec4(sum0) * descale;
+    vec4 sumfp32_1 = vec4(sum1) * descale;
+    vec4 sumfp32_2 = vec4(sum2) * descale;
+    vec4 sumfp32_3 = vec4(sum3) * descale;
+
+    if (bias_term == 1)
+    {
+        const vec4 bias = bias_data[gy];
+
+        sumfp32_0 += bias;
+        sumfp32_1 += bias;
+        sumfp32_2 += bias;
+        sumfp32_3 += bias;
+    }
+
+    sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), activation_type, activation_param_0, activation_param_1));
+    sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1));
+    sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1));
+    sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1));
+
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = buffer_ld1(top_scales_data, 0);
+        sumfp32_0 *= top_scale;
+        sumfp32_1 *= top_scale;
+        sumfp32_2 *= top_scale;
+        sumfp32_3 *= top_scale;
+
+        const ivec4 v0 = float2int8vec4(sumfp32_0);
+        const ivec4 v1 = float2int8vec4(sumfp32_1);
+        const ivec4 v2 = float2int8vec4(sumfp32_2);
+        const ivec4 v3 = float2int8vec4(sumfp32_3);
+
+        if (out_elempack == 4)
+        {
+            const int gi = gy * psc(outcstep) + gx * 4;
+
+            i8buffer_st4(top_blob_int8_data, gi + 0, v0);
+            if (gx * 4 + 1 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 1, v1);
+            if (gx * 4 + 2 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 2, v2);
+            if (gx * 4 + 3 < psc(outcstep)) i8buffer_st4(top_blob_int8_data, gi + 3, v3);
+        }
+        else // out_elempack == 1
+        {
+            const ivec4 o0 = ivec4(v0.r, v1.r, v2.r, v3.r);
+            const ivec4 o1 = ivec4(v0.g, v1.g, v2.g, v3.g);
+            const ivec4 o2 = ivec4(v0.b, v1.b, v2.b, v3.b);
+            const ivec4 o3 = ivec4(v0.a, v1.a, v2.a, v3.a);
+
+            const int gi = base_ch * psc(outcstep_native) + gx;
+
+            i8buffer_st4(top_blob_int8_data, gi, o0);
+            if (base_ch + 1 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native), o1);
+            if (base_ch + 2 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native) * 2, o2);
+            if (base_ch + 3 < nout) i8buffer_st4(top_blob_int8_data, gi + psc(outcstep_native) * 3, o3);
+        }
+    }
+    else
+    {
+        if (out_elempack == 4)
+        {
+            const int gi = gy * psc(outcstep) + gx * 4;
+
+            buffer_st4(top_blob_data, gi + 0, afpvec4(sumfp32_0));
+            if (gx * 4 + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, afpvec4(sumfp32_1));
+            if (gx * 4 + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, afpvec4(sumfp32_2));
+            if (gx * 4 + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, afpvec4(sumfp32_3));
+        }
+        else // out_elempack == 1
+        {
+            const vec4 o0 = vec4(sumfp32_0.r, sumfp32_1.r, sumfp32_2.r, sumfp32_3.r);
+            const vec4 o1 = vec4(sumfp32_0.g, sumfp32_1.g, sumfp32_2.g, sumfp32_3.g);
+            const vec4 o2 = vec4(sumfp32_0.b, sumfp32_1.b, sumfp32_2.b, sumfp32_3.b);
+            const vec4 o3 = vec4(sumfp32_0.a, sumfp32_1.a, sumfp32_2.a, sumfp32_3.a);
+
+            const int gi = base_ch * psc(outcstep_native) + gx;
+
+            buffer_st4(top_blob_data, gi, afpvec4(o0));
+            if (base_ch + 1 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native), afpvec4(o1));
+            if (base_ch + 2 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native) * 2, afpvec4(o2));
+            if (base_ch + 3 < nout) buffer_st4(top_blob_data, gi + psc(outcstep_native) * 3, afpvec4(o3));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp b/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp
new file mode 100644
index 000000000000..cc45e90be915
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_packed_gemm_int8.comp
@@ -0,0 +1,358 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+// int8 convolution evaluated as a GEMM: every invocation accumulates four
+// consecutive output positions (gx .. gx + 3) for one group of four output
+// channels (gy) over N = c * kernel_w * kernel_h reduction steps.
+
+// reduction steps staged in shared memory per tile
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+layout(constant_id = 0) const int kernel_w = 1;
+layout(constant_id = 1) const int kernel_h = 1;
+layout(constant_id = 2) const int dilation_w = 1;
+layout(constant_id = 3) const int dilation_h = 1;
+layout(constant_id = 4) const int stride_w = 1;
+layout(constant_id = 5) const int stride_h = 1;
+layout(constant_id = 6) const int bias_term = 0;
+layout(constant_id = 7) const int activation_type = 0;
+layout(constant_id = 8) const float activation_param_0 = 0;
+layout(constant_id = 9) const float activation_param_1 = 0;
+layout(constant_id = 10) const int use_int8_requantize = 0;
+layout(constant_id = 11) const int elempack = 1;
+layout(constant_id = 12) const int out_elempack = 1;
+
+#define shape_constant_id_offset 13
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int num_output = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int num_input = 0;
+
+layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; };
+layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; };
+layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; };
+layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; };
+layout(binding = 4) readonly buffer weight_blob { ivec4 weight_data[]; };
+layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; };
+layout(binding = 6) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; };
+layout(binding = 7) readonly buffer top_scales_blob { sfp top_scales_data[]; };
+layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; };
+layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; };
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+#if NCNN_shader_local_memory
+// tiles shared by the workgroup: tmp_v[local x][step], tmp_k[local y][step]
+// NOTE(review): the fixed 8 presumably matches an 8x8 workgroup - confirm
+shared ivec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH];
+shared ivec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
+#endif
+
+// Read one int8 value from each of up to four consecutive input channels
+// starting at ch0, at the given spatial offset, and pack them into one int.
+// Channels at or beyond num_input are left as zero.
+int load_input_pack1(const int ch0, const int spatial)
+{
+    ivec4 v = ivec4(0);
+    v.r = i8buffer_ld1(bottom_blob_int8_data_1, ch0 * psc(cstep) + spatial);
+    if (ch0 + 1 < num_input) v.g = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 1) * psc(cstep) + spatial);
+    if (ch0 + 2 < num_input) v.b = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 2) * psc(cstep) + spatial);
+    if (ch0 + 3 < num_input) v.a = i8buffer_ld1(bottom_blob_int8_data_1, (ch0 + 3) * psc(cstep) + spatial);
+
+    return packInt4x8(v);
+}
+
+// Fetch the packed int8 activations consumed by reduction step zz for the four
+// output positions gx4. Lanes whose position is >= outsize stay zero, so they
+// add nothing to the accumulators.
+ivec4 load_input_x4(const int zz, const int maxk, const ivec4 gx4, const int outsize, const ivec4 sxs4, const ivec4 sys4)
+{
+    // split zz into input channel group sz and kernel tap (ky, kx)
+    const int sz = zz / maxk;
+    const int k = zz - sz * maxk;
+    const int ky = k / kernel_w;
+    const int kx = k - ky * kernel_w;
+
+    const ivec4 spatial = (sys4 + ky * dilation_h) * psc(w) + sxs4 + kx * dilation_w;
+
+    ivec4 vp = ivec4(0);
+
+    if (elempack == 4)
+    {
+        const ivec4 v_offset = sz * psc(cstep) + spatial;
+
+        if (gx4.r < outsize) vp.r = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r);
+        if (gx4.g < outsize) vp.g = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g);
+        if (gx4.b < outsize) vp.b = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b);
+        if (gx4.a < outsize) vp.a = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a);
+    }
+    else // elempack == 1
+    {
+        const int ch0 = sz * 4;
+
+        if (gx4.r < outsize) vp.r = load_input_pack1(ch0, spatial.r);
+        if (gx4.g < outsize) vp.g = load_input_pack1(ch0, spatial.g);
+        if (gx4.b < outsize) vp.b = load_input_pack1(ch0, spatial.b);
+        if (gx4.a < outsize) vp.a = load_input_pack1(ch0, spatial.a);
+    }
+
+    return vp;
+}
+
+// Integer dot product; the built-in dot() only takes floating-point vectors.
+int int_dot4(const ivec4 x, const ivec4 y)
+{
+    return x.r * y.r + x.g * y.g + x.b * y.b + x.a * y.a;
+}
+
+// Accumulate one reduction step. vp holds the packed activations of the four
+// output positions and kp the packed weights of the four output channels;
+// sumN[c] gains the int8 dot product of position N with output channel c.
+void mac_4x4(const ivec4 vp, const ivec4 kp, inout ivec4 sum0, inout ivec4 sum1, inout ivec4 sum2, inout ivec4 sum3)
+{
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+    sum0 += ivec4(dotPacked4x8EXT(vp.r, kp.r), dotPacked4x8EXT(vp.r, kp.g), dotPacked4x8EXT(vp.r, kp.b), dotPacked4x8EXT(vp.r, kp.a));
+    sum1 += ivec4(dotPacked4x8EXT(vp.g, kp.r), dotPacked4x8EXT(vp.g, kp.g), dotPacked4x8EXT(vp.g, kp.b), dotPacked4x8EXT(vp.g, kp.a));
+    sum2 += ivec4(dotPacked4x8EXT(vp.b, kp.r), dotPacked4x8EXT(vp.b, kp.g), dotPacked4x8EXT(vp.b, kp.b), dotPacked4x8EXT(vp.b, kp.a));
+    sum3 += ivec4(dotPacked4x8EXT(vp.a, kp.r), dotPacked4x8EXT(vp.a, kp.g), dotPacked4x8EXT(vp.a, kp.b), dotPacked4x8EXT(vp.a, kp.a));
+#else
+    const ivec4 v0 = unpackInt4x8(vp.r);
+    const ivec4 v1 = unpackInt4x8(vp.g);
+    const ivec4 v2 = unpackInt4x8(vp.b);
+    const ivec4 v3 = unpackInt4x8(vp.a);
+
+    const ivec4 k0 = unpackInt4x8(kp.r);
+    const ivec4 k1 = unpackInt4x8(kp.g);
+    const ivec4 k2 = unpackInt4x8(kp.b);
+    const ivec4 k3 = unpackInt4x8(kp.a);
+
+    sum0 += ivec4(int_dot4(v0, k0), int_dot4(v0, k1), int_dot4(v0, k2), int_dot4(v0, k3));
+    sum1 += ivec4(int_dot4(v1, k0), int_dot4(v1, k1), int_dot4(v1, k2), int_dot4(v1, k3));
+    sum2 += ivec4(int_dot4(v2, k0), int_dot4(v2, k1), int_dot4(v2, k2), int_dot4(v2, k3));
+    sum3 += ivec4(int_dot4(v3, k0), int_dot4(v3, k1), int_dot4(v3, k2), int_dot4(v3, k3));
+#endif
+}
+
+// Store the four output channels of one position to the pack1 int8 output;
+// channels at or beyond num_output are skipped.
+void store_pack1_int8(const int gi, const int channel_step, const int base_ch, const ivec4 v)
+{
+    i8buffer_st1(top_blob_int8_data_1, gi, v.r);
+    if (base_ch + 1 < num_output) i8buffer_st1(top_blob_int8_data_1, gi + channel_step, v.g);
+    if (base_ch + 2 < num_output) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 2, v.b);
+    if (base_ch + 3 < num_output) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 3, v.a);
+}
+
+// Store the four output channels of one position to the pack1 floating-point
+// output; channels at or beyond num_output are skipped.
+void store_pack1_fp(const int gi, const int channel_step, const int base_ch, const vec4 v)
+{
+    buffer_st1(top_blob_data_1, gi, afp(v.r));
+    if (base_ch + 1 < num_output) buffer_st1(top_blob_data_1, gi + channel_step, afp(v.g));
+    if (base_ch + 2 < num_output) buffer_st1(top_blob_data_1, gi + channel_step * 2, afp(v.b));
+    if (base_ch + 3 < num_output) buffer_st1(top_blob_data_1, gi + channel_step * 3, afp(v.a));
+}
+
+void main()
+{
+    const int gx = int(gl_GlobalInvocationID.x) * 4;
+    const int gy = int(gl_GlobalInvocationID.y);
+
+    const int outsize = psc(outw) * psc(outh);
+
+#if !NCNN_shader_local_memory
+    if (gx >= outsize || gy >= psc(outc))
+        return;
+#endif
+
+    const int maxk = kernel_w * kernel_h;
+    const int N = psc(c) * maxk;
+    const ivec4 gx4 = gx + ivec4(0, 1, 2, 3);
+
+    // input coordinate of kernel tap (0, 0) for each of the four positions
+    const ivec4 sxs4 = (gx4 % psc(outw)) * stride_w;
+    const ivec4 sys4 = (gx4 / psc(outw)) * stride_h;
+
+    ivec4 sum0 = ivec4(0);
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+    // weight_data is indexed as [gy][reduction step], one ivec4 per entry
+    const int w_base = gy * N;
+
+#if NCNN_shader_local_memory
+    // Out-of-range invocations keep running through the reduction: they still
+    // fill shared tiles read by other invocations and must reach the barriers.
+    const int lx = int(gl_LocalInvocationID.x);
+    const int ly = int(gl_LocalInvocationID.y);
+
+    int z = 0;
+    for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < N; z += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        if (ly < LOCAL_MEMORY_UNROLL_INCH)
+        {
+            tmp_v[lx][ly] = load_input_x4(z + ly, maxk, gx4, outsize, sxs4, sys4);
+        }
+
+        // NOTE(review): invocations with gy >= outc also read weight_data here;
+        // confirm that read stays inside the weight buffer (or is made safe).
+        if (lx < LOCAL_MEMORY_UNROLL_INCH)
+        {
+            tmp_k[ly][lx] = weight_data[w_base + z + lx];
+        }
+
+        barrier();
+
+        for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
+        {
+            mac_4x4(tmp_v[lx][z4], tmp_k[ly][z4], sum0, sum1, sum2, sum3);
+        }
+
+        barrier();
+    }
+
+    // tail tile with fewer than LOCAL_MEMORY_UNROLL_INCH steps
+    if (z < N)
+    {
+        const int remain = N - z;
+
+        if (ly < remain)
+        {
+            tmp_v[lx][ly] = load_input_x4(z + ly, maxk, gx4, outsize, sxs4, sys4);
+        }
+
+        if (lx < remain)
+        {
+            tmp_k[ly][lx] = weight_data[w_base + z + lx];
+        }
+
+        barrier();
+
+        for (int z4 = 0; z4 < remain; z4++)
+        {
+            mac_4x4(tmp_v[lx][z4], tmp_k[ly][z4], sum0, sum1, sum2, sum3);
+        }
+    }
+#else
+    for (int z = 0; z < N; z++)
+    {
+        const ivec4 vp = load_input_x4(z, maxk, gx4, outsize, sxs4, sys4);
+        mac_4x4(vp, weight_data[w_base + z], sum0, sum1, sum2, sum3);
+    }
+#endif
+
+#if NCNN_shader_local_memory
+    if (gx >= outsize || gy >= psc(outc))
+        return;
+#endif
+
+    const int base_ch = gy * 4;
+    const vec4 descale = weight_descales_data[gy];
+
+    vec4 sumfp32_0 = vec4(sum0) * descale;
+    vec4 sumfp32_1 = vec4(sum1) * descale;
+    vec4 sumfp32_2 = vec4(sum2) * descale;
+    vec4 sumfp32_3 = vec4(sum3) * descale;
+
+    if (bias_term == 1)
+    {
+        const vec4 bias = bias_data[gy];
+
+        sumfp32_0 += bias;
+        sumfp32_1 += bias;
+        sumfp32_2 += bias;
+        sumfp32_3 += bias;
+    }
+
+    sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), activation_type, activation_param_0, activation_param_1));
+    sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1));
+    sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1));
+    sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1));
+
+    // gx is in range here; gx + 1 .. gx + 3 still need checking before each store
+    if (use_int8_requantize == 1)
+    {
+        const float top_scale = buffer_ld1(top_scales_data, 0);
+        sumfp32_0 *= top_scale;
+        sumfp32_1 *= top_scale;
+        sumfp32_2 *= top_scale;
+        sumfp32_3 *= top_scale;
+
+        const ivec4 q0 = float2int8vec4(sumfp32_0);
+        const ivec4 q1 = float2int8vec4(sumfp32_1);
+        const ivec4 q2 = float2int8vec4(sumfp32_2);
+        const ivec4 q3 = float2int8vec4(sumfp32_3);
+
+        if (out_elempack == 4)
+        {
+            const int gi = gy * psc(outcstep) + gx;
+
+            i8buffer_st4(top_blob_int8_data_4, gi, q0);
+            if (gx + 1 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 1, q1);
+            if (gx + 2 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 2, q2);
+            if (gx + 3 < outsize) i8buffer_st4(top_blob_int8_data_4, gi + 3, q3);
+        }
+        else // out_elempack == 1
+        {
+            // NOTE(review): assumes psc(outcstep) is four times the per-channel
+            // stride of the pack1 output - confirm against the host code
+            const int channel_step = psc(outcstep) / 4;
+            const int gi = gy * psc(outcstep) + gx;
+
+            store_pack1_int8(gi, channel_step, base_ch, q0);
+            if (gx + 1 < outsize) store_pack1_int8(gi + 1, channel_step, base_ch, q1);
+            if (gx + 2 < outsize) store_pack1_int8(gi + 2, channel_step, base_ch, q2);
+            if (gx + 3 < outsize) store_pack1_int8(gi + 3, channel_step, base_ch, q3);
+        }
+    }
+    else
+    {
+        if (out_elempack == 4)
+        {
+            const int gi = gy * psc(outcstep) + gx;
+
+            buffer_st4(top_blob_data_4, gi, afpvec4(sumfp32_0));
+            if (gx + 1 < outsize) buffer_st4(top_blob_data_4, gi + 1, afpvec4(sumfp32_1));
+            if (gx + 2 < outsize) buffer_st4(top_blob_data_4, gi + 2, afpvec4(sumfp32_2));
+            if (gx + 3 < outsize) buffer_st4(top_blob_data_4, gi + 3, afpvec4(sumfp32_3));
+        }
+        else // out_elempack == 1
+        {
+            // NOTE(review): same psc(outcstep) assumption as the int8 pack1 store
+            const int channel_step = psc(outcstep) / 4;
+            const int gi = gy * psc(outcstep) + gx;
+
+            store_pack1_fp(gi, channel_step, base_ch, sumfp32_0);
+            if (gx + 1 < outsize) store_pack1_fp(gi + 1, channel_step, base_ch, sumfp32_1);
+            if (gx + 2 < outsize) store_pack1_fp(gi + 2, channel_step, base_ch, sumfp32_2);
+            if (gx + 3 < outsize) store_pack1_fp(gi + 3, channel_step, base_ch, sumfp32_3);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolution_packed_int8.comp b/src/layer/vulkan/shader/convolution_packed_int8.comp
new file mode 100644
index 000000000000..42faac9b33dc
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution_packed_int8.comp
@@ -0,0 +1,439 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int kernel_w = 1; // kernel width: bound of the inner x tap loop in main()
+layout(constant_id = 1) const int kernel_h = 1; // kernel height: bound of the y tap loop in main()
+layout(constant_id = 2) const int dilation_w = 1; // input step between neighbouring kernel columns
+layout(constant_id = 3) const int dilation_h = 1; // input step, in rows, between neighbouring kernel rows
+layout(constant_id = 4) const int stride_w = 1; // input step between neighbouring output columns
+layout(constant_id = 5) const int stride_h = 1; // input step, in rows, between neighbouring output rows
+layout(constant_id = 6) const int bias_term = 0; // 1: main() adds bias_data to the descaled sums
+layout(constant_id = 7) const int activation_type = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 8) const float activation_param_0 = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 9) const float activation_param_1 = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 10) const int use_int8_requantize = 0; // 1: multiply by element 0 of top_scales_data and store int8; otherwise store through the fp output views
+layout(constant_id = 11) const int elempack = 1; // 4: main() reads the input through the 4-wide view; any other value takes the scalar path
+layout(constant_id = 12) const int out_elempack = 1; // 4: main() writes through the 4-wide output view; any other value takes the scalar path
+
+#define shape_constant_id_offset 13
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // read via psc(w): input row length used when stepping rows
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // not referenced in this shader's main()
+layout(constant_id = shape_constant_id_offset + 2) const int c = 0; // read via psc(c): number of input z slices iterated
+layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; // read via psc(cstep): index distance between input z slices
+layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; // read via psc(outw): x bound and output row length
+layout(constant_id = shape_constant_id_offset + 5) const int outh = 0; // read via psc(outh): y bound
+layout(constant_id = shape_constant_id_offset + 6) const int outc = 0; // read via psc(outc): z bound
+layout(constant_id = shape_constant_id_offset + 7) const int outcstep = 0; // read via psc(outcstep): index distance between output z slices
+layout(constant_id = shape_constant_id_offset + 8) const int num_output = 0; // lane limit checked by the out_elempack == 1 stores
+layout(constant_id = shape_constant_id_offset + 9) const int num_input = 0; // not referenced in this shader's main()
+
+// scalar view  (for pack1 input/output access)
+layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; // input, read when elempack != 4
+layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; // fp output, written when out_elempack != 4 and use_int8_requantize != 1
+
+// vec4 view  (for pack4 input/output access, weight/bias always vec4)
+layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; // input, read when elempack == 4
+layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; // fp output, written when out_elempack == 4 and use_int8_requantize != 1
+layout(binding = 4) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // int8 weights, fetched with i8buffer_sm4 in both input paths
+layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; }; // indexed by output z index; used only when bias_term == 1
+layout(binding = 6) readonly buffer weight_descales_blob { vec4 weight_descales_data[]; }; // indexed by output z index; multiplies the integer sums
+layout(binding = 7) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // main() reads element 0 only, when use_int8_requantize == 1
+layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; }; // int8 output, written when use_int8_requantize == 1 and out_elempack != 4
+layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; }; // int8 output, written when use_int8_requantize == 1 and out_elempack == 4
+
+layout(push_constant) uniform parameter // NOTE(review): presumably the runtime fallback that psc() uses when the same-named specialization constant is 0 - psc() is defined outside this shader, confirm
+{
+    int w; // input shape fields, same names as the input shape specialization constants
+    int h;
+    int c;
+    int cstep;
+
+    int outw; // output shape fields, same names as the output shape specialization constants
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // int8 convolution; each invocation accumulates a 2x2 (x, y) tile for two adjacent output z indices
+{
+    int gx = int(gl_GlobalInvocationID.x) * 2; // tile origin: every axis advances two outputs per invocation
+    int gy = int(gl_GlobalInvocationID.y) * 2;
+    int gz = int(gl_GlobalInvocationID.z) * 2;
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) // NOTE(review): psc() is defined outside this shader; presumably selects the specialization constant or the push constant of the same name - confirm
+        return;
+
+    int maxk = kernel_w * kernel_h; // taps per kernel window
+    ivec2 gx2 = gx + ivec2(0, 1); // the two tile columns
+    ivec2 gy2 = gy + ivec2(0, 1); // the two tile rows
+    ivec2 gz2 = gz + ivec2(0, 1); // the two output z indices
+
+    ivec4 sum0 = ivec4(0); // sum0..sum3: z index gz at (gx, gy), (gx+1, gy), (gx, gy+1), (gx+1, gy+1)
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+    ivec4 sum4 = ivec4(0); // sum4..sum7: the same four positions for z index gz+1
+    ivec4 sum5 = ivec4(0);
+    ivec4 sum6 = ivec4(0);
+    ivec4 sum7 = ivec4(0);
+
+    for (int z = 0; z < psc(c); z++)
+    {
+        if (elempack == 4)
+        {
+            ivec4 v_offset; // input indices of the four tile positions: r/g on row gy, b/a on row gy+1
+            v_offset.rg = z * psc(cstep) + gy2.x * stride_h * psc(w) + gx2 * stride_w;
+            v_offset.ba = z * psc(cstep) + gy2.y * stride_h * psc(w) + gx2 * stride_w;
+
+            ivec2 w_offset = (gz2 * psc(c) + z) * maxk * 4; // four weight words per tap (one per output lane); .x for gz, .y for gz+1
+
+            for (int y = 0; y < kernel_h; y++)
+            {
+                for (int x = 0; x < kernel_w; x++)
+                {
+                    int v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset.r + x * dilation_w); // NOTE(review): i8buffer_sm4 is defined outside this shader; its result is consumed below as four packed int8 values
+                    int v1p = gx + 1 < psc(outw) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.g + x * dilation_w) : 0; // tile positions outside the output contribute 0
+                    int v2p = gy + 1 < psc(outh) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.b + x * dilation_w) : 0;
+                    int v3p = gx + 1 < psc(outw) && gy + 1 < psc(outh) ? i8buffer_sm4(bottom_blob_int8_data_4, v_offset.a + x * dilation_w) : 0;
+
+                    int k0p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 0); // k0p..k3p: weights of the four lanes of z index gz
+                    int k1p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 1);
+                    int k2p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 2);
+                    int k3p = i8buffer_sm4(weight_data, w_offset.x + x * 4 + 3);
+                    int k4p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 0); // k4p..k7p: weights of the four lanes of z index gz+1
+                    int k5p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 1);
+                    int k6p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 2);
+                    int k7p = i8buffer_sm4(weight_data, w_offset.y + x * 4 + 3);
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+                    sum0 += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p));
+                    sum1 += ivec4(dotPacked4x8EXT(v1p, k0p), dotPacked4x8EXT(v1p, k1p), dotPacked4x8EXT(v1p, k2p), dotPacked4x8EXT(v1p, k3p));
+                    sum2 += ivec4(dotPacked4x8EXT(v2p, k0p), dotPacked4x8EXT(v2p, k1p), dotPacked4x8EXT(v2p, k2p), dotPacked4x8EXT(v2p, k3p));
+                    sum3 += ivec4(dotPacked4x8EXT(v3p, k0p), dotPacked4x8EXT(v3p, k1p), dotPacked4x8EXT(v3p, k2p), dotPacked4x8EXT(v3p, k3p));
+                    sum4 += ivec4(dotPacked4x8EXT(v0p, k4p), dotPacked4x8EXT(v0p, k5p), dotPacked4x8EXT(v0p, k6p), dotPacked4x8EXT(v0p, k7p));
+                    sum5 += ivec4(dotPacked4x8EXT(v1p, k4p), dotPacked4x8EXT(v1p, k5p), dotPacked4x8EXT(v1p, k6p), dotPacked4x8EXT(v1p, k7p));
+                    sum6 += ivec4(dotPacked4x8EXT(v2p, k4p), dotPacked4x8EXT(v2p, k5p), dotPacked4x8EXT(v2p, k6p), dotPacked4x8EXT(v2p, k7p));
+                    sum7 += ivec4(dotPacked4x8EXT(v3p, k4p), dotPacked4x8EXT(v3p, k5p), dotPacked4x8EXT(v3p, k6p), dotPacked4x8EXT(v3p, k7p));
+#else
+                    ivec4 v0 = unpackInt4x8(v0p); // fallback: unpack every word and form the same 4-term dot products by hand
+                    ivec4 v1 = unpackInt4x8(v1p);
+                    ivec4 v2 = unpackInt4x8(v2p);
+                    ivec4 v3 = unpackInt4x8(v3p);
+                    ivec4 k0 = unpackInt4x8(k0p);
+                    ivec4 k1 = unpackInt4x8(k1p);
+                    ivec4 k2 = unpackInt4x8(k2p);
+                    ivec4 k3 = unpackInt4x8(k3p);
+                    ivec4 k4 = unpackInt4x8(k4p);
+                    ivec4 k5 = unpackInt4x8(k5p);
+                    ivec4 k6 = unpackInt4x8(k6p);
+                    ivec4 k7 = unpackInt4x8(k7p);
+
+                    sum0.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a;
+                    sum0.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a;
+                    sum0.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a;
+                    sum0.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a;
+                    sum1.r += v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a;
+                    sum1.g += v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a;
+                    sum1.b += v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a;
+                    sum1.a += v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a;
+                    sum2.r += v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a;
+                    sum2.g += v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a;
+                    sum2.b += v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a;
+                    sum2.a += v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a;
+                    sum3.r += v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a;
+                    sum3.g += v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a;
+                    sum3.b += v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a;
+                    sum3.a += v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a;
+                    sum4.r += v0.r * k4.r + v0.g * k4.g + v0.b * k4.b + v0.a * k4.a;
+                    sum4.g += v0.r * k5.r + v0.g * k5.g + v0.b * k5.b + v0.a * k5.a;
+                    sum4.b += v0.r * k6.r + v0.g * k6.g + v0.b * k6.b + v0.a * k6.a;
+                    sum4.a += v0.r * k7.r + v0.g * k7.g + v0.b * k7.b + v0.a * k7.a;
+                    sum5.r += v1.r * k4.r + v1.g * k4.g + v1.b * k4.b + v1.a * k4.a;
+                    sum5.g += v1.r * k5.r + v1.g * k5.g + v1.b * k5.b + v1.a * k5.a;
+                    sum5.b += v1.r * k6.r + v1.g * k6.g + v1.b * k6.b + v1.a * k6.a;
+                    sum5.a += v1.r * k7.r + v1.g * k7.g + v1.b * k7.b + v1.a * k7.a;
+                    sum6.r += v2.r * k4.r + v2.g * k4.g + v2.b * k4.b + v2.a * k4.a;
+                    sum6.g += v2.r * k5.r + v2.g * k5.g + v2.b * k5.b + v2.a * k5.a;
+                    sum6.b += v2.r * k6.r + v2.g * k6.g + v2.b * k6.b + v2.a * k6.a;
+                    sum6.a += v2.r * k7.r + v2.g * k7.g + v2.b * k7.b + v2.a * k7.a;
+                    sum7.r += v3.r * k4.r + v3.g * k4.g + v3.b * k4.b + v3.a * k4.a;
+                    sum7.g += v3.r * k5.r + v3.g * k5.g + v3.b * k5.b + v3.a * k5.a;
+                    sum7.b += v3.r * k6.r + v3.g * k6.g + v3.b * k6.b + v3.a * k6.a;
+                    sum7.a += v3.r * k7.r + v3.g * k7.g + v3.b * k7.b + v3.a * k7.a;
+#endif
+                }
+
+                v_offset += dilation_h * psc(w); // next kernel row in the input
+                w_offset += kernel_w * 4; // next kernel row in the weights
+            }
+        }
+        else // elempack == 1
+        {
+            ivec2 w_offset = (gz2 * psc(c) + z) * maxk; // one weight word per tap, holding the four output lanes; .x for gz, .y for gz+1
+
+            for (int y = 0; y < kernel_h; y++)
+            {
+                ivec4 v_offset; // recomputed per kernel row here instead of being advanced incrementally
+                v_offset.rg = z * psc(cstep) + gy2.x * stride_h * psc(w) + gx2 * stride_w + y * dilation_h * psc(w);
+                v_offset.ba = z * psc(cstep) + gy2.y * stride_h * psc(w) + gx2 * stride_w + y * dilation_h * psc(w);
+
+                for (int x = 0; x < kernel_w; x++)
+                {
+                    int v0 = i8buffer_ld1(bottom_blob_int8_data_1, v_offset.r + x * dilation_w); // one scalar input value per tile position
+                    int v1 = gx + 1 < psc(outw) ? i8buffer_ld1(bottom_blob_int8_data_1, v_offset.g + x * dilation_w) : 0; // tile positions outside the output contribute 0
+                    int v2 = gy + 1 < psc(outh) ? i8buffer_ld1(bottom_blob_int8_data_1, v_offset.b + x * dilation_w) : 0;
+                    int v3 = gx + 1 < psc(outw) && gy + 1 < psc(outh) ? i8buffer_ld1(bottom_blob_int8_data_1, v_offset.a + x * dilation_w) : 0;
+
+                    int k0p = i8buffer_sm4(weight_data, w_offset.x + x);
+                    int k1p = i8buffer_sm4(weight_data, w_offset.y + x);
+
+                    ivec4 k0 = unpackInt4x8(k0p);
+                    ivec4 k1 = unpackInt4x8(k1p);
+
+                    sum0 += v0 * k0; // scalar input times the four per-lane weights
+                    sum1 += v1 * k0;
+                    sum2 += v2 * k0;
+                    sum3 += v3 * k0;
+                    sum4 += v0 * k1;
+                    sum5 += v1 * k1;
+                    sum6 += v2 * k1;
+                    sum7 += v3 * k1;
+                }
+
+                w_offset += kernel_w; // next kernel row in the weights
+            }
+        }
+    }
+
+    int nout = num_output; // lanes numbered at or above this are not stored in the out_elempack == 1 paths
+    ivec2 base_ch = gz2 * 4; // lane number of the first lane of each z index (four lanes per z index)
+
+    const vec4 descale0 = weight_descales_data[gz2.x]; // per-lane factors that turn the integer sums into floats
+    const vec4 descale1 = weight_descales_data[gz2.y];
+
+    vec4 sumfp32_0 = vec4(sum0) * descale0;
+    vec4 sumfp32_1 = vec4(sum1) * descale0;
+    vec4 sumfp32_2 = vec4(sum2) * descale0;
+    vec4 sumfp32_3 = vec4(sum3) * descale0;
+    vec4 sumfp32_4 = vec4(sum4) * descale1;
+    vec4 sumfp32_5 = vec4(sum5) * descale1;
+    vec4 sumfp32_6 = vec4(sum6) * descale1;
+    vec4 sumfp32_7 = vec4(sum7) * descale1;
+
+    if (bias_term == 1)
+    {
+        vec4 bias0 = bias_data[gz2.x]; // bias is added after descaling, in float
+        vec4 bias1 = bias_data[gz2.y];
+
+        sumfp32_0 += bias0;
+        sumfp32_1 += bias0;
+        sumfp32_2 += bias0;
+        sumfp32_3 += bias0;
+        sumfp32_4 += bias1;
+        sumfp32_5 += bias1;
+        sumfp32_6 += bias1;
+        sumfp32_7 += bias1;
+    }
+
+    sumfp32_0 = vec4(activation_afpvec4(afpvec4(sumfp32_0), activation_type, activation_param_0, activation_param_1)); // NOTE(review): activation_afpvec4 is not defined in this shader; presumably provided by the included vulkan_activation.comp
+    sumfp32_1 = vec4(activation_afpvec4(afpvec4(sumfp32_1), activation_type, activation_param_0, activation_param_1));
+    sumfp32_2 = vec4(activation_afpvec4(afpvec4(sumfp32_2), activation_type, activation_param_0, activation_param_1));
+    sumfp32_3 = vec4(activation_afpvec4(afpvec4(sumfp32_3), activation_type, activation_param_0, activation_param_1));
+    sumfp32_4 = vec4(activation_afpvec4(afpvec4(sumfp32_4), activation_type, activation_param_0, activation_param_1));
+    sumfp32_5 = vec4(activation_afpvec4(afpvec4(sumfp32_5), activation_type, activation_param_0, activation_param_1));
+    sumfp32_6 = vec4(activation_afpvec4(afpvec4(sumfp32_6), activation_type, activation_param_0, activation_param_1));
+    sumfp32_7 = vec4(activation_afpvec4(afpvec4(sumfp32_7), activation_type, activation_param_0, activation_param_1));
+
+    if (use_int8_requantize == 1)
+    {
+        float top_scale = buffer_ld1(top_scales_data, 0); // only element 0 is read: one scale shared by every lane
+        sumfp32_0 *= top_scale;
+        sumfp32_1 *= top_scale;
+        sumfp32_2 *= top_scale;
+        sumfp32_3 *= top_scale;
+        sumfp32_4 *= top_scale;
+        sumfp32_5 *= top_scale;
+        sumfp32_6 *= top_scale;
+        sumfp32_7 *= top_scale;
+
+        ivec4 v0 = float2int8vec4(sumfp32_0); // NOTE(review): float2int8vec4 is defined outside this shader; presumably rounds and clamps to the int8 range - confirm
+        ivec4 v1 = float2int8vec4(sumfp32_1);
+        ivec4 v2 = float2int8vec4(sumfp32_2);
+        ivec4 v3 = float2int8vec4(sumfp32_3);
+        ivec4 v4 = float2int8vec4(sumfp32_4);
+        ivec4 v5 = float2int8vec4(sumfp32_5);
+        ivec4 v6 = float2int8vec4(sumfp32_6);
+        ivec4 v7 = float2int8vec4(sumfp32_7);
+
+        if (out_elempack == 4)
+        {
+            ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; // index of (gx, gy) in z slice gz (.x) and gz+1 (.y)
+
+            i8buffer_st4(top_blob_int8_data_4, gi.x, v0);
+            if (gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.x + 1, v1); // tile positions outside the output are not written
+            if (gy + 1 < psc(outh)) i8buffer_st4(top_blob_int8_data_4, gi.x + psc(outw), v2);
+            if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.x + psc(outw) + 1, v3);
+            if (gz + 1 < psc(outc)) // the second z index may lie past the last output slice
+            {
+                i8buffer_st4(top_blob_int8_data_4, gi.y, v4);
+                if (gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.y + 1, v5);
+                if (gy + 1 < psc(outh)) i8buffer_st4(top_blob_int8_data_4, gi.y + psc(outw), v6);
+                if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) i8buffer_st4(top_blob_int8_data_4, gi.y + psc(outw) + 1, v7);
+            }
+        }
+        else // out_elempack == 1
+        {
+            int channel_step = psc(outcstep) / 4; // index stride between the four lanes of one z index
+            ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; // location of lane 0; lanes 1..3 follow at multiples of channel_step
+
+            i8buffer_st1(top_blob_int8_data_1, gi.x, v0.r);
+            if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + channel_step, v0.g); // lanes numbered at or above num_output are skipped
+            if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + channel_step * 2, v0.b);
+            if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + channel_step * 3, v0.a);
+            if (gx + 1 < psc(outw))
+            {
+                i8buffer_st1(top_blob_int8_data_1, gi.x + 1, v1.r);
+                if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step, v1.g);
+                if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step * 2, v1.b);
+                if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.x + 1 + channel_step * 3, v1.a);
+            }
+            if (gy + 1 < psc(outh))
+            {
+                int gi2 = gi.x + psc(outw); // same column, next output row
+                i8buffer_st1(top_blob_int8_data_1, gi2, v2.r);
+                if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step, v2.g);
+                if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step * 2, v2.b);
+                if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi2 + channel_step * 3, v2.a);
+            }
+            if (gy + 1 < psc(outh) && gx + 1 < psc(outw))
+            {
+                int gi3 = gi.x + psc(outw) + 1; // next column, next output row
+                i8buffer_st1(top_blob_int8_data_1, gi3, v3.r);
+                if (base_ch.x + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step, v3.g);
+                if (base_ch.x + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step * 2, v3.b);
+                if (base_ch.x + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi3 + channel_step * 3, v3.a);
+            }
+            if (gz + 1 < psc(outc)) // second z index: here lane 0 is also checked against num_output
+            {
+                if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi.y, v4.r);
+                if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step, v4.g);
+                if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step * 2, v4.b);
+                if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + channel_step * 3, v4.a);
+                if (gx + 1 < psc(outw))
+                {
+                    if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1, v5.r);
+                    if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step, v5.g);
+                    if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step * 2, v5.b);
+                    if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi.y + 1 + channel_step * 3, v5.a);
+                }
+                if (gy + 1 < psc(outh))
+                {
+                    int gi6 = gi.y + psc(outw);
+                    if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi6, v6.r);
+                    if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step, v6.g);
+                    if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step * 2, v6.b);
+                    if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi6 + channel_step * 3, v6.a);
+                }
+                if (gy + 1 < psc(outh) && gx + 1 < psc(outw))
+                {
+                    int gi7 = gi.y + psc(outw) + 1;
+                    if (base_ch.y < nout) i8buffer_st1(top_blob_int8_data_1, gi7, v7.r);
+                    if (base_ch.y + 1 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step, v7.g);
+                    if (base_ch.y + 2 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step * 2, v7.b);
+                    if (base_ch.y + 3 < nout) i8buffer_st1(top_blob_int8_data_1, gi7 + channel_step * 3, v7.a);
+                }
+            }
+        }
+    }
+    else
+    {
+        if (out_elempack == 4) // fp output: same store pattern as the int8 branch above, without the top_scale multiply
+        {
+            ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; // index of (gx, gy) in z slice gz (.x) and gz+1 (.y)
+
+            buffer_st4(top_blob_data_4, gi.x, afpvec4(sumfp32_0));
+            if (gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.x + 1, afpvec4(sumfp32_1));
+            if (gy + 1 < psc(outh)) buffer_st4(top_blob_data_4, gi.x + psc(outw), afpvec4(sumfp32_2));
+            if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.x + psc(outw) + 1, afpvec4(sumfp32_3));
+            if (gz + 1 < psc(outc))
+            {
+                buffer_st4(top_blob_data_4, gi.y, afpvec4(sumfp32_4));
+                if (gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.y + 1, afpvec4(sumfp32_5));
+                if (gy + 1 < psc(outh)) buffer_st4(top_blob_data_4, gi.y + psc(outw), afpvec4(sumfp32_6));
+                if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data_4, gi.y + psc(outw) + 1, afpvec4(sumfp32_7));
+            }
+        }
+        else // out_elempack == 1
+        {
+            int channel_step = psc(outcstep) / 4; // index stride between the four lanes of one z index
+            ivec2 gi = gz2 * psc(outcstep) + gy * psc(outw) + gx; // location of lane 0; lanes 1..3 follow at multiples of channel_step
+
+            buffer_st1(top_blob_data_1, gi.x, afp(sumfp32_0.r));
+            if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step, afp(sumfp32_0.g)); // lanes numbered at or above num_output are skipped
+            if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step * 2, afp(sumfp32_0.b));
+            if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi.x + channel_step * 3, afp(sumfp32_0.a));
+            if (gx + 1 < psc(outw))
+            {
+                buffer_st1(top_blob_data_1, gi.x + 1, afp(sumfp32_1.r));
+                if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step, afp(sumfp32_1.g));
+                if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step * 2, afp(sumfp32_1.b));
+                if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi.x + 1 + channel_step * 3, afp(sumfp32_1.a));
+            }
+            if (gy + 1 < psc(outh))
+            {
+                int gi2 = gi.x + psc(outw); // same column, next output row
+                buffer_st1(top_blob_data_1, gi2, afp(sumfp32_2.r));
+                if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step, afp(sumfp32_2.g));
+                if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step * 2, afp(sumfp32_2.b));
+                if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi2 + channel_step * 3, afp(sumfp32_2.a));
+            }
+            if (gy + 1 < psc(outh) && gx + 1 < psc(outw))
+            {
+                int gi3 = gi.x + psc(outw) + 1; // next column, next output row
+                buffer_st1(top_blob_data_1, gi3, afp(sumfp32_3.r));
+                if (base_ch.x + 1 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step, afp(sumfp32_3.g));
+                if (base_ch.x + 2 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step * 2, afp(sumfp32_3.b));
+                if (base_ch.x + 3 < nout) buffer_st1(top_blob_data_1, gi3 + channel_step * 3, afp(sumfp32_3.a));
+            }
+            if (gz + 1 < psc(outc)) // second z index: here lane 0 is also checked against num_output
+            {
+                if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi.y, afp(sumfp32_4.r));
+                if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step, afp(sumfp32_4.g));
+                if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step * 2, afp(sumfp32_4.b));
+                if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi.y + channel_step * 3, afp(sumfp32_4.a));
+                if (gx + 1 < psc(outw))
+                {
+                    if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi.y + 1, afp(sumfp32_5.r));
+                    if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi.y + 1 + channel_step, afp(sumfp32_5.g));
+                    if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi.y + 1 + channel_step * 2, afp(sumfp32_5.b));
+                    if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi.y + 1 + channel_step * 3, afp(sumfp32_5.a));
+                }
+                if (gy + 1 < psc(outh))
+                {
+                    int gi6 = gi.y + psc(outw);
+                    if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi6, afp(sumfp32_6.r));
+                    if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step, afp(sumfp32_6.g));
+                    if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step * 2, afp(sumfp32_6.b));
+                    if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi6 + channel_step * 3, afp(sumfp32_6.a));
+                }
+                if (gy + 1 < psc(outh) && gx + 1 < psc(outw))
+                {
+                    int gi7 = gi.y + psc(outw) + 1;
+                    if (base_ch.y < nout) buffer_st1(top_blob_data_1, gi7, afp(sumfp32_7.r));
+                    if (base_ch.y + 1 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step, afp(sumfp32_7.g));
+                    if (base_ch.y + 2 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step * 2, afp(sumfp32_7.b));
+                    if (base_ch.y + 3 < nout) buffer_st1(top_blob_data_1, gi7 + channel_step * 3, afp(sumfp32_7.a));
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp
new file mode 100644
index 000000000000..ab4782d2da26
--- /dev/null
+++ b/src/layer/vulkan/shader/convolutiondepthwise_group_packed_int8.comp
@@ -0,0 +1,202 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int kernel_w = 1; // kernel width: bound of the inner x tap loop in main()
+layout(constant_id = 1) const int kernel_h = 1; // kernel height: bound of the y tap loop in main()
+layout(constant_id = 2) const int dilation_w = 1; // input step between neighbouring kernel columns
+layout(constant_id = 3) const int dilation_h = 1; // input step, in rows, between neighbouring kernel rows
+layout(constant_id = 4) const int stride_w = 1; // input step between neighbouring output columns
+layout(constant_id = 5) const int stride_h = 1; // input step, in rows, between neighbouring output rows
+layout(constant_id = 6) const int bias_term = 0; // 1: main() adds bias_data to the descaled sum
+layout(constant_id = 7) const int group = 1; // number of groups: divides psc(c) and bounds the group index in main()
+layout(constant_id = 8) const int activation_type = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 9) const float activation_param_0 = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 10) const float activation_param_1 = 0; // passed unchanged to activation_afpvec4
+layout(constant_id = 11) const int use_int8_requantize = 0; // 1: multiply by top_scales_data and store int8; otherwise store through the fp output views
+layout(constant_id = 12) const int elempack = 1; // 4: main() reads the input through the 4-wide view; any other value takes the scalar path
+layout(constant_id = 13) const int out_elempack = 1; // 4: main() writes through the 4-wide output view; any other value takes the scalar path
+layout(constant_id = 14) const int num_output_g = 1; // output lanes per group: sets the per-group z count and the lane guards in main()
+
+#define shape_constant_id_offset 15
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // not referenced in this shader's main()
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0; // read via psc(w): input row length used when stepping rows
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0; // not referenced in this shader's main()
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0; // read via psc(c): divided by group to get the input z slices per group
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; // read via psc(cstep): index distance between input z slices
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; // not referenced in this shader's main()
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; // read via psc(outw): x bound and output row length
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; // read via psc(outh): y bound
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; // not referenced in this shader's main()
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; // read via psc(outcstep): index distance between output z slices
+
+layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; // scalar input view, read when elempack != 4
+layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; // scalar fp output, written when out_elempack != 4 and use_int8_requantize != 1
+layout(binding = 2) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; // 4-wide input view, read when elempack == 4
+layout(binding = 3) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; // 4-wide fp output, written when out_elempack == 4 and use_int8_requantize != 1
+layout(binding = 4) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // int8 weights, fetched with i8buffer_sm4 in both input paths
+layout(binding = 5) readonly buffer bias_blob { vec4 bias_data[]; }; // indexed by gz_aligned; used only when bias_term == 1
+layout(binding = 6) readonly buffer descales_blob { vec4 descales_data[]; }; // indexed by gz_aligned; multiplies the integer sum
+layout(binding = 7) readonly buffer top_scales_blob { sfpvec4 top_scales_data[]; }; // indexed by gz_aligned; read only when use_int8_requantize == 1
+layout(binding = 8) writeonly buffer top_blob_int8_1 { sint8 top_blob_int8_data_1[]; }; // scalar int8 output, written when use_int8_requantize == 1 and out_elempack != 4
+layout(binding = 9) writeonly buffer top_blob_int8_4 { sint8vec4 top_blob_int8_data_4[]; }; // 4-wide int8 output, written when use_int8_requantize == 1 and out_elempack == 4
+
+layout(push_constant) uniform parameter // NOTE(review): presumably the runtime fallback that psc() uses when the same-named specialization constant is 0 - psc() is defined outside this shader, confirm
+{
+    int dims; // input shape fields, same names as the input shape specialization constants
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims; // output shape fields, same names as the output shape specialization constants
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // int8 grouped convolution; each invocation produces the four lanes of one output z index at one (gx, gy)
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z); // flat index over (group, z index within the group); split just below
+
+    int outc_g = (num_output_g + 3) / 4; // z indices per group: num_output_g rounded up to whole sets of four lanes
+    int gg = gz / outc_g; // group number
+    gz = gz - gg * outc_g; // z index inside that group
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gg >= group) // NOTE(review): psc() is defined outside this shader; presumably selects the specialization constant or the push constant of the same name - confirm
+        return;
+
+    int maxk = kernel_w * kernel_h; // taps per kernel window
+    int channels_g = psc(c) / group; // input z slices per group
+    int outc_g_aligned = (outc_g + 7) / 8 * 8; // outc_g rounded up to a multiple of 8
+    int gz_aligned = gg * outc_g_aligned + gz; // index used for weight_data, descales_data, bias_data and top_scales_data
+    int top_z = gg * outc_g + gz; // output z slice used by the out_elempack == 4 stores (no rounding to 8)
+
+    ivec4 sum = ivec4(0); // one integer accumulator per output lane
+
+    // group convolution
+    for (int z = 0; z < channels_g; z++)
+    {
+        if (elempack == 4)
+        {
+            int v_offset = (gg * channels_g + z) * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; // input index of the window origin inside this group's z slice
+            int w_offset = (gz_aligned * channels_g + z) * maxk * 4; // four weight words per tap, one per output lane
+
+            for (int y = 0; y < kernel_h; y++)
+            {
+                for (int x = 0; x < kernel_w; x++)
+                {
+                    int v0p = i8buffer_sm4(bottom_blob_int8_data_4, v_offset + x * dilation_w); // NOTE(review): i8buffer_sm4 is defined outside this shader; its result is consumed below as four packed int8 values
+                    int k0p = i8buffer_sm4(weight_data, w_offset + x * 4 + 0);
+                    int k1p = i8buffer_sm4(weight_data, w_offset + x * 4 + 1);
+                    int k2p = i8buffer_sm4(weight_data, w_offset + x * 4 + 2);
+                    int k3p = i8buffer_sm4(weight_data, w_offset + x * 4 + 3);
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+                    sum += ivec4(dotPacked4x8EXT(v0p, k0p), dotPacked4x8EXT(v0p, k1p), dotPacked4x8EXT(v0p, k2p), dotPacked4x8EXT(v0p, k3p));
+#else
+                    ivec4 v0 = unpackInt4x8(v0p); // fallback: unpack every word and form the same 4-term dot products by hand
+                    ivec4 k0 = unpackInt4x8(k0p);
+                    ivec4 k1 = unpackInt4x8(k1p);
+                    ivec4 k2 = unpackInt4x8(k2p);
+                    ivec4 k3 = unpackInt4x8(k3p);
+
+                    sum.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a;
+                    sum.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a;
+                    sum.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a;
+                    sum.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a;
+#endif
+                }
+
+                v_offset += dilation_h * psc(w); // next kernel row in the input
+                w_offset += kernel_w * 4; // next kernel row in the weights
+            }
+        }
+        else // elempack == 1
+        {
+            int w_offset = (gz_aligned * channels_g + z) * maxk; // one weight word per tap, holding the four output lanes
+
+            for (int y = 0; y < kernel_h; y++)
+            {
+                int v_offset = (gg * channels_g + z) * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w + y * dilation_h * psc(w); // recomputed per kernel row here instead of being advanced incrementally
+
+                for (int x = 0; x < kernel_w; x++)
+                {
+                    int v0 = i8buffer_ld1(bottom_blob_int8_data_1, v_offset + x * dilation_w);
+                    int k0p = i8buffer_sm4(weight_data, w_offset + x);
+                    ivec4 k0 = unpackInt4x8(k0p);
+
+                    sum += v0 * k0; // scalar input times the four per-lane weights
+                }
+
+                w_offset += kernel_w; // next kernel row in the weights
+            }
+        }
+    }
+
+    vec4 sumfp32 = vec4(sum) * descales_data[gz_aligned]; // per-lane factors that turn the integer sum into floats
+
+    if (bias_term == 1)
+    {
+        sumfp32 += bias_data[gz_aligned]; // bias is added after descaling, in float
+    }
+
+    sumfp32 = vec4(activation_afpvec4(afpvec4(sumfp32), activation_type, activation_param_0, activation_param_1)); // NOTE(review): activation_afpvec4 is not defined in this shader; presumably provided by the included vulkan_activation.comp
+
+    int outch0 = gz * 4; // lane number of lane 0 within the group
+
+    if (use_int8_requantize == 1)
+    {
+        vec4 top_scale = vec4(buffer_ld4(top_scales_data, gz_aligned)); // a separate scale per lane, unlike a single shared value
+        sumfp32 *= top_scale;
+        ivec4 v = float2int8vec4(sumfp32); // NOTE(review): float2int8vec4 is defined outside this shader; presumably rounds and clamps to the int8 range - confirm
+
+        if (out_elempack == 4)
+        {
+            int gi = top_z * psc(outcstep) + gy * psc(outw) + gx; // index of (gx, gy) in output z slice top_z
+            i8buffer_st4(top_blob_int8_data_4, gi, v);
+        }
+        else // out_elempack == 1
+        {
+            int channel_step = psc(outcstep) / 4; // index stride between consecutive lanes
+            int base_ch = gg * num_output_g + outch0; // lane 0 counted across all groups
+            int gi = base_ch * channel_step + gy * psc(outw) + gx; // location of lane 0; lanes 1..3 follow at multiples of channel_step
+
+            i8buffer_st1(top_blob_int8_data_1, gi, v.r);
+            if (outch0 + 1 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step, v.g); // lanes at or beyond num_output_g are skipped
+            if (outch0 + 2 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 2, v.b);
+            if (outch0 + 3 < num_output_g) i8buffer_st1(top_blob_int8_data_1, gi + channel_step * 3, v.a);
+        }
+    }
+    else
+    {
+        if (out_elempack == 4) // fp output: same addressing as the int8 branch above, without the top_scale multiply
+        {
+            int gi = top_z * psc(outcstep) + gy * psc(outw) + gx; // index of (gx, gy) in output z slice top_z
+            buffer_st4(top_blob_data_4, gi, afpvec4(sumfp32));
+        }
+        else // out_elempack == 1
+        {
+            int channel_step = psc(outcstep) / 4; // index stride between consecutive lanes
+            int base_ch = gg * num_output_g + outch0; // lane 0 counted across all groups
+            int gi = base_ch * channel_step + gy * psc(outw) + gx; // location of lane 0; lanes 1..3 follow at multiples of channel_step
+
+            buffer_st1(top_blob_data_1, gi, afp(sumfp32.r));
+            if (outch0 + 1 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step, afp(sumfp32.g)); // lanes at or beyond num_output_g are skipped
+            if (outch0 + 2 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step * 2, afp(sumfp32.b));
+            if (outch0 + 3 < num_output_g) buffer_st1(top_blob_data_1, gi + channel_step * 3, afp(sumfp32.a));
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/convolutiondepthwise_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_int8.comp
new file mode 100644
index 000000000000..5b541f6f1888
--- /dev/null
+++ b/src/layer/vulkan/shader/convolutiondepthwise_int8.comp
@@ -0,0 +1,138 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Require the integer dot product extension only when all three ncnn_* capability macros below are set.
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int kernel_w = 1; // kernel width in taps
+layout(constant_id = 1) const int kernel_h = 1; // kernel height in taps
+layout(constant_id = 2) const int dilation_w = 1; // input step between adjacent kernel columns
+layout(constant_id = 3) const int dilation_h = 1; // input step between adjacent kernel rows
+layout(constant_id = 4) const int stride_w = 1; // input step between adjacent output columns
+layout(constant_id = 5) const int stride_h = 1; // input step between adjacent output rows
+layout(constant_id = 6) const int bias_term = 0; // 1: add bias_data[gz] after scaling
+layout(constant_id = 7) const int group = 1; // not referenced by main() in this shader
+layout(constant_id = 8) const int activation_type = 0; // forwarded to activation_afp()
+layout(constant_id = 9) const float activation_param_0 = 0; // forwarded to activation_afp()
+layout(constant_id = 10) const float activation_param_1 = 0; // forwarded to activation_afp()
+layout(constant_id = 11) const int use_int8_requantize = 0; // 1: store int8 to binding 6, else store to binding 1
+// Shape constants, read in main() through psc(); presumably psc(x) falls back to p.x when x is 0 - TODO confirm.
+#define shape_constant_id_offset 12
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_int8_data[]; }; // input, addressed as gz * cstep + y * w + x
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // output when use_int8_requantize != 1
+layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // weights, one element per four kernel taps, maxk4d4 elements per gz
+layout(binding = 3) readonly buffer bias_blob { float bias_data[]; }; // added per gz when bias_term == 1
+layout(binding = 4) readonly buffer descales_blob { float descales_data[]; }; // per-gz factor applied to the integer sum
+layout(binding = 5) readonly buffer top_scales_blob { sfp top_scales_data[]; }; // per-gz factor applied before the int8 store
+layout(binding = 6) writeonly buffer top_blob_int8 { sint8 top_blob_int8_data[]; }; // output when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p; // runtime shape values with the same names as the shape constants above
+// One invocation computes output element (gx, gy) of channel gz: integer multiply-accumulate, then scale, bias and activation.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return;
+
+    const int maxk = kernel_w * kernel_h; // taps per channel
+    const int maxk4 = (maxk + 3) / 4 * 4; // maxk rounded up to a multiple of 4
+    const int maxk4d4 = maxk4 / 4; // weight_data elements per channel
+
+    int sum = 0;
+
+    int k = 0;
+    for (; k + 3 < maxk; k += 4) // full groups of four taps
+    {
+        const ivec4 k4 = k + ivec4(0, 1, 2, 3);
+        const ivec4 ky4 = k4 / kernel_w; // kernel row of each tap
+        const ivec4 kx4 = k4 - ky4 * kernel_w; // kernel column of each tap
+        const ivec4 x4 = gx * stride_w + kx4 * dilation_w; // input column of each tap
+        const ivec4 y4 = gy * stride_h + ky4 * dilation_h; // input row of each tap
+        // NOTE(review): x4/y4 are not range-checked here; presumably the input is padded before dispatch - confirm
+        const int v0 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.r * psc(w) + x4.r);
+        const int v1 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.g * psc(w) + x4.g);
+        const int v2 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.b * psc(w) + x4.b);
+        const int v3 = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y4.a * psc(w) + x4.a);
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        const int vp = int((uint(v0) & 0xffu) | ((uint(v1) & 0xffu) << 8) | ((uint(v2) & 0xffu) << 16) | ((uint(v3) & 0xffu) << 24)); // low bytes of v0..v3 in bits 0-7, 8-15, 16-23, 24-31
+        const int kvp = i8buffer_sm4(weight_data, gz * maxk4d4 + k / 4); // weights of these four taps, kept as one int
+        sum += dotPacked4x8EXT(vp, kvp);
+#else
+        const ivec4 v = ivec4(v0, v1, v2, v3);
+        const ivec4 kv = i8buffer_ld4(weight_data, gz * maxk4d4 + k / 4); // same weight element as four separate ints
+        sum += v.r * kv.r + v.g * kv.g + v.b * kv.b + v.a * kv.a;
+#endif
+    }
+    // leftover taps when maxk is not a multiple of 4, handled one at a time
+    for (; k < maxk; k++)
+    {
+        const int ky = k / kernel_w;
+        const int kx = k - ky * kernel_w;
+        const int x = gx * stride_w + kx * dilation_w;
+        const int y = gy * stride_h + ky * dilation_h;
+
+        const int v = i8buffer_ld1(bottom_blob_int8_data, gz * psc(cstep) + y * psc(w) + x);
+        const int kvp = i8buffer_sm4(weight_data, gz * maxk4d4 + k / 4);
+        const int ktail = k - k / 4 * 4; // k modulo 4: which byte of kvp belongs to this tap
+        const int kv = (kvp << (24 - ktail * 8)) >> 24; // move byte ktail to the top, then shift back down so the sign is extended
+
+        sum += v * kv;
+    }
+
+    float sumfp32 = float(sum) * descales_data[gz]; // integer sum converted to float and scaled per gz
+
+    if (bias_term == 1)
+    {
+        sumfp32 += bias_data[gz];
+    }
+
+    sumfp32 = float(activation_afp(afp(sumfp32), activation_type, activation_param_0, activation_param_1)); // activation is evaluated on the afp value, result widened back to float
+
+    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // output element index, used by both store paths
+
+    if (use_int8_requantize == 1)
+    {
+        sumfp32 *= buffer_ld1(top_scales_data, gz);
+        const int v = float2int8(sumfp32);
+        i8buffer_st1(top_blob_int8_data, gi, v);
+    }
+    else
+    {
+        buffer_st1(top_blob_data, gi, afp(sumfp32));
+    }
+}
diff --git a/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp b/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp
new file mode 100644
index 000000000000..9c9ddfde1367
--- /dev/null
+++ b/src/layer/vulkan/shader/convolutiondepthwise_pack4_int8.comp
@@ -0,0 +1,161 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Require the integer dot product extension only when all three ncnn_* capability macros below are set.
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int kernel_w = 1; // kernel width in taps
+layout(constant_id = 1) const int kernel_h = 1; // kernel height in taps
+layout(constant_id = 2) const int dilation_w = 1; // input step between adjacent kernel columns
+layout(constant_id = 3) const int dilation_h = 1; // input step between adjacent kernel rows
+layout(constant_id = 4) const int stride_w = 1; // input step between adjacent output columns
+layout(constant_id = 5) const int stride_h = 1; // input step between adjacent output rows
+layout(constant_id = 6) const int bias_term = 0; // 1: add bias_data[gz] after scaling
+layout(constant_id = 7) const int group = 1; // not referenced by main() in this shader
+layout(constant_id = 8) const int activation_type = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 9) const float activation_param_0 = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 10) const float activation_param_1 = 0; // forwarded to activation_afpvec4()
+layout(constant_id = 11) const int use_int8_requantize = 0; // 1: store int8 to binding 6, else store to binding 1
+// Shape constants, read in main() through psc(); presumably psc(x) falls back to p.x when x is 0 - TODO confirm.
+#define shape_constant_id_offset 12
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // input, four lanes per element, addressed as gz * cstep + y * w + x
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // output when use_int8_requantize != 1
+layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; // component N packs four tap weights of lane N; maxk4 / 4 elements per gz
+layout(binding = 3) readonly buffer bias_blob { vec4 bias_data[]; }; // added per gz when bias_term == 1
+layout(binding = 4) readonly buffer descales_blob { vec4 descales_data[]; }; // per-gz factors applied to the integer sums
+layout(binding = 5) readonly buffer top_scales_blob { sfpvec4 top_scales_data[]; }; // per-gz factors applied before the int8 store
+layout(binding = 6) writeonly buffer top_blob_int8 { sint8vec4 top_blob_int8_data[]; }; // output when use_int8_requantize == 1
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p; // runtime shape values with the same names as the shape constants above
+// One invocation computes output element (gx, gy) for the four packed lanes of gz, accumulating in integers before scaling.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return;
+
+    const int maxk = kernel_w * kernel_h; // taps per lane
+    const int maxk4 = (maxk + 3) / 4 * 4; // maxk rounded up to a multiple of 4
+
+    ivec4 sum = ivec4(0); // one integer accumulator per lane
+
+    int k = 0;
+    for (; k + 3 < maxk; k += 4) // full groups of four taps
+    {
+        const ivec4 k4 = k + ivec4(0, 1, 2, 3);
+        const ivec4 ky4 = k4 / kernel_w; // kernel row of each tap
+        const ivec4 kx4 = k4 - ky4 * kernel_w; // kernel column of each tap
+        const ivec4 x4 = gx * stride_w + kx4 * dilation_w; // input column of each tap
+        const ivec4 y4 = gy * stride_h + ky4 * dilation_h; // input row of each tap
+        // vNp: the four lanes of the input element under tap N, kept together in one int
+        const int v0p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.r * psc(w) + x4.r);
+        const int v1p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.g * psc(w) + x4.g);
+        const int v2p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.b * psc(w) + x4.b);
+        const int v3p = i8buffer_sm4(bottom_blob_int8_data, gz * psc(cstep) + y4.a * psc(w) + x4.a);
+        // kp.r/g/b/a: the four tap weights of lane 0/1/2/3, one int per lane
+        const ivec4 kp = weight_data[gz * maxk4 / 4 + k / 4];
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        const uint v0u = uint(v0p);
+        const uint v1u = uint(v1p);
+        const uint v2u = uint(v2p);
+        const uint v3u = uint(v3p);
+        // regroup bytes: vpN gathers byte N of taps 0..3 so that it lines up with kp component N
+        const int vp0 = int((v0u & 0x000000ffu) | ((v1u & 0x000000ffu) << 8) | ((v2u & 0x000000ffu) << 16) | ((v3u & 0x000000ffu) << 24));
+        const int vp1 = int(((v0u & 0x0000ff00u) >> 8) | (v1u & 0x0000ff00u) | ((v2u & 0x0000ff00u) << 8) | ((v3u & 0x0000ff00u) << 16));
+        const int vp2 = int(((v0u & 0x00ff0000u) >> 16) | ((v1u & 0x00ff0000u) >> 8) | (v2u & 0x00ff0000u) | ((v3u & 0x00ff0000u) << 8));
+        const int vp3 = int(((v0u & 0xff000000u) >> 24) | ((v1u & 0xff000000u) >> 16) | ((v2u & 0xff000000u) >> 8) | (v3u & 0xff000000u));
+
+        sum.r += dotPacked4x8EXT(vp0, kp.r);
+        sum.g += dotPacked4x8EXT(vp1, kp.g);
+        sum.b += dotPacked4x8EXT(vp2, kp.b);
+        sum.a += dotPacked4x8EXT(vp3, kp.a);
+#else
+        const ivec4 v0 = unpackInt4x8(v0p); // lanes of tap 0
+        const ivec4 v1 = unpackInt4x8(v1p); // lanes of tap 1
+        const ivec4 v2 = unpackInt4x8(v2p); // lanes of tap 2
+        const ivec4 v3 = unpackInt4x8(v3p); // lanes of tap 3
+
+        const ivec4 k0 = unpackInt4x8(kp.r); // tap weights of lane 0
+        const ivec4 k1 = unpackInt4x8(kp.g); // tap weights of lane 1
+        const ivec4 k2 = unpackInt4x8(kp.b); // tap weights of lane 2
+        const ivec4 k3 = unpackInt4x8(kp.a); // tap weights of lane 3
+
+        sum.r += v0.r * k0.r + v1.r * k0.g + v2.r * k0.b + v3.r * k0.a;
+        sum.g += v0.g * k1.r + v1.g * k1.g + v2.g * k1.b + v3.g * k1.a;
+        sum.b += v0.b * k2.r + v1.b * k2.g + v2.b * k2.b + v3.b * k2.a;
+        sum.a += v0.a * k3.r + v1.a * k3.g + v2.a * k3.b + v3.a * k3.a;
+#endif
+    }
+    // leftover taps when maxk is not a multiple of 4, handled one at a time
+    for (; k < maxk; k++)
+    {
+        const int ky = k / kernel_w;
+        const int kx = k - ky * kernel_w;
+        const int x = gx * stride_w + kx * dilation_w;
+        const int y = gy * stride_h + ky * dilation_h;
+
+        const ivec4 v = i8buffer_ld4(bottom_blob_int8_data, gz * psc(cstep) + y * psc(w) + x);
+        const ivec4 kp = weight_data[gz * maxk4 / 4 + k / 4];
+        const int ktail = k - k / 4 * 4; // k modulo 4: which byte of each kp component belongs to this tap
+        const ivec4 kv = (kp << ivec4(24 - ktail * 8)) >> ivec4(24); // per lane: move byte ktail to the top, shift back down so the sign is extended
+
+        sum += v * kv;
+    }
+
+    vec4 sumfp32 = vec4(sum) * descales_data[gz]; // integer sums converted to float and scaled per lane
+
+    if (bias_term == 1)
+    {
+        sumfp32 += bias_data[gz];
+    }
+
+    sumfp32 = vec4(activation_afpvec4(afpvec4(sumfp32), activation_type, activation_param_0, activation_param_1)); // activation is evaluated on the afpvec4 value, result widened back to vec4
+
+    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // output element index, used by both store paths
+
+    if (use_int8_requantize == 1)
+    {
+        sumfp32 *= vec4(buffer_ld4(top_scales_data, gz));
+        const ivec4 v = float2int8vec4(sumfp32);
+        i8buffer_st4(top_blob_int8_data, gi, v);
+    }
+    else
+    {
+        buffer_st4(top_blob_data, gi, afpvec4(sumfp32));
+    }
+}
diff --git a/src/layer/vulkan/shader/flatten_int8.comp b/src/layer/vulkan/shader/flatten_int8.comp
new file mode 100644
index 000000000000..110b84fb1452
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_int8.comp
@@ -0,0 +1,55 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Shape constants read through psc(); main() uses w, h, cstep and outw, the others are not referenced there.
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, read one element at a time at z * cstep + y * w + x
+layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // destination, written at the flat index gx
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p; // runtime shape values with the same names as the shape constants above
+
+void main()
+{
+    // Copies one int8 element per invocation from the channel-strided input to flat output index gx.
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    // the output is addressed along x only
+    if (gy >= 1 || gz >= 1 || gx >= psc(outw))
+        return;
+
+    // split the flat index into channel, row and column, then rebuild the strided source offset
+    const int plane_size = psc(w) * psc(h);
+    const int in_plane = gx % plane_size;
+    const int src_offset = (gx / plane_size) * psc(cstep) + (in_plane / psc(w)) * psc(w) + in_plane % psc(w);
+
+    const int value = i8buffer_ld1(bottom_blob_data, src_offset);
+    i8buffer_st1(top_blob_data, gx, value);
+}
diff --git a/src/layer/vulkan/shader/flatten_pack1to4_int8.comp b/src/layer/vulkan/shader/flatten_pack1to4_int8.comp
new file mode 100644
index 000000000000..86fabd3f5399
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_pack1to4_int8.comp
@@ -0,0 +1,74 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Shape constants read through psc(); main() uses dims, w, h, cstep and outw, the others are not referenced there.
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, read one scalar at a time
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // destination, one four-component element per invocation
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p; // runtime shape values with the same names as the shape constants above
+
+void main()
+{
+    // Each invocation gathers four consecutive flattened elements into one four-component output.
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    // the output is addressed along x only
+    if (gy >= 1 || gz >= 1 || gx >= psc(outw))
+        return;
+
+    // flattened indices of the four elements gathered by this invocation
+    const ivec4 flat4 = ivec4(0, 1, 2, 3) + gx * 4;
+
+    ivec4 src4;
+    if (psc(dims) != 2)
+    {
+        // handled as 3-D: channels are cstep apart, rows are w apart
+        const int plane_size = psc(w) * psc(h);
+        const ivec4 in_plane4 = flat4 % plane_size;
+
+        src4 = (flat4 / plane_size) * psc(cstep) + (in_plane4 / psc(w)) * psc(w) + in_plane4 % psc(w);
+    }
+    else
+    {
+        // 2-D: rows are w apart
+        src4 = (flat4 / psc(w)) * psc(w) + flat4 % psc(w);
+    }
+
+    // load the four source scalars individually
+    ivec4 vals4;
+    vals4.r = i8buffer_ld1(bottom_blob_data, src4.r);
+    vals4.g = i8buffer_ld1(bottom_blob_data, src4.g);
+    vals4.b = i8buffer_ld1(bottom_blob_data, src4.b);
+    vals4.a = i8buffer_ld1(bottom_blob_data, src4.a);
+
+    i8buffer_st4(top_blob_data, gx, vals4);
+}
diff --git a/src/layer/vulkan/shader/flatten_pack4_int8.comp b/src/layer/vulkan/shader/flatten_pack4_int8.comp
new file mode 100644
index 000000000000..88b02ac01be1
--- /dev/null
+++ b/src/layer/vulkan/shader/flatten_pack4_int8.comp
@@ -0,0 +1,69 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Shape constants read through psc(); main() uses dims, w, h, cstep and outw, the others are not referenced there.
+#define shape_constant_id_offset 0
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // source, addressed per scalar as (element index) * 4 + lane
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // destination, one four-component element per invocation
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p; // runtime shape values with the same names as the shape constants above
+
+void main()
+{
+    // Each invocation copies four consecutive flattened elements into one four-component output.
+    const int gx = int(gl_GlobalInvocationID.x);
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    // the output is addressed along x only
+    if (gy >= 1 || gz >= 1 || gx >= psc(outw))
+        return;
+
+    // flattened indices of the four elements written by this invocation
+    const ivec4 flat4 = ivec4(0, 1, 2, 3) + gx * 4;
+
+    // scalar source offsets: (four-wide element index) * 4 + lane inside that element
+    ivec4 src4;
+    if (psc(dims) != 2)
+    {
+        // handled as 3-D: the channel index is split into group (z / 4) and lane (z % 4)
+        const int plane_size = psc(w) * psc(h);
+        const ivec4 ch4 = flat4 / plane_size;
+        const ivec4 in_plane4 = flat4 % plane_size;
+        src4 = ((ch4 / 4) * psc(cstep) + (in_plane4 / psc(w)) * psc(w) + in_plane4 % psc(w)) * 4 + ch4 % 4;
+    }
+    else
+    {
+        // 2-D: the row index is split into group (y / 4) and lane (y % 4)
+        const ivec4 row4 = flat4 / psc(w);
+        src4 = ((row4 / 4) * psc(w) + flat4 % psc(w)) * 4 + row4 % 4;
+    }
+
+    i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, src4);
+}
diff --git a/src/layer/vulkan/shader/gemm_int8.comp b/src/layer/vulkan/shader/gemm_int8.comp
new file mode 100644
index 000000000000..34d552991367
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8.comp
@@ -0,0 +1,462 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+// Require the integer dot product extension only when all three ncnn_* capability macros below are set.
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+// Amount by which k advances per outer-loop iteration of the shared-memory path in main().
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+layout(constant_id = 0) const float alpha = 1.f;
+layout(constant_id = 1) const float beta = 1.f; // multiplies the values read from C_blob_data in main()
+layout(constant_id = 2) const int constantC = 0; // 1: use constant_broadcast_type_C instead of p.broadcast_type_C
+layout(constant_id = 3) const int constant_broadcast_type_C = 0;
+layout(constant_id = 4) const int output_transpose = 0;
+
+layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout(binding = 1) readonly buffer A_int8_blob { sint8 A_int8_data[]; }; // read at row * K + k with row < M
+layout(binding = 2) readonly buffer B_int8_blob { sint8 B_int8_data[]; }; // read at col * K + k with col < N
+layout(binding = 3) readonly buffer C_blob { sfp C_blob_data[]; }; // added to the result unless broadcast_type_C is -1
+layout(binding = 4) readonly buffer A_descales_blob { float A_descales_data[]; }; // one factor per row of A
+layout(binding = 5) readonly buffer B_descale_blob { float B_descale_data[]; }; // main() reads element 0 as the single factor for B
+
+layout(push_constant) uniform parameter
+{
+    int M; // bound for the row index into A
+    int N; // bound for the row index into B
+    int K; // length of each A and B row
+    int broadcast_type_C; // used when constantC != 1
+    int outhstep;
+} p;
+
+#if NCNN_shader_local_memory
+// avoid bank conflict
+#define PAD 1
+shared int tmp_a[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; // each int holds four A values packed with packInt4x8
+shared int tmp_b[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; // each int holds four B values packed with packInt4x8
+#endif
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    const uint gz = gl_GlobalInvocationID.z;
+
+#if !NCNN_shader_local_memory
+    if (gx * 4 >= p.N || gy * 4 >= p.M || gz >= 1)
+        return;
+#else
+    if (gz >= 1)
+        return;
+#endif
+
+    ivec4 sum0 = ivec4(0);
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+#if NCNN_shader_local_memory
+    const uint lx = gl_LocalInvocationID.x;
+    const uint ly = gl_LocalInvocationID.y;
+
+    for (int k = 0; k < p.K; k += LOCAL_MEMORY_UNROLL_INCH)
+    {
+        {
+            ivec4 a = ivec4(0);
+            const int ak = k + int(lx / 4) * 4;
+            const uint ay = gy * 4 + lx % 4;
+            if (ay < p.M)
+            {
+                const int ai = int(ay) * p.K + ak;
+                if (ak + 0 < p.K) a.r = i8buffer_ld1(A_int8_data, ai + 0);
+                if (ak + 1 < p.K) a.g = i8buffer_ld1(A_int8_data, ai + 1);
+                if (ak + 2 < p.K) a.b = i8buffer_ld1(A_int8_data, ai + 2);
+                if (ak + 3 < p.K) a.a = i8buffer_ld1(A_int8_data, ai + 3);
+            }
+
+            tmp_a[ly][lx] = packInt4x8(a);
+
+            ivec4 b = ivec4(0);
+            const int bk = k + int(ly / 4) * 4;
+            const uint bx = gx * 4 + ly % 4;
+            if (bx < p.N)
+            {
+                const int bi = int(bx) * p.K + bk;
+                if (bk + 0 < p.K) b.r = i8buffer_ld1(B_int8_data, bi + 0);
+                if (bk + 1 < p.K) b.g = i8buffer_ld1(B_int8_data, bi + 1);
+                if (bk + 2 < p.K) b.b = i8buffer_ld1(B_int8_data, bi + 2);
+                if (bk + 3 < p.K) b.a = i8buffer_ld1(B_int8_data, bi + 3);
+            }
+
+            tmp_b[lx][ly] = packInt4x8(b);
+        }
+
+        barrier();
+
+        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH / 4; k4++)
+        {
+            const int kk = k4 * 4;
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+            const int a0 = tmp_a[ly][kk + 0];
+            const int a1 = tmp_a[ly][kk + 1];
+            const int a2 = tmp_a[ly][kk + 2];
+            const int a3 = tmp_a[ly][kk + 3];
+
+            const int b0 = tmp_b[lx][kk + 0];
+            const int b1 = tmp_b[lx][kk + 1];
+            const int b2 = tmp_b[lx][kk + 2];
+            const int b3 = tmp_b[lx][kk + 3];
+
+            sum0.r += dotPacked4x8EXT(a0, b0);
+            sum0.g += dotPacked4x8EXT(a0, b1);
+            sum0.b += dotPacked4x8EXT(a0, b2);
+            sum0.a += dotPacked4x8EXT(a0, b3);
+            sum1.r += dotPacked4x8EXT(a1, b0);
+            sum1.g += dotPacked4x8EXT(a1, b1);
+            sum1.b += dotPacked4x8EXT(a1, b2);
+            sum1.a += dotPacked4x8EXT(a1, b3);
+            sum2.r += dotPacked4x8EXT(a2, b0);
+            sum2.g += dotPacked4x8EXT(a2, b1);
+            sum2.b += dotPacked4x8EXT(a2, b2);
+            sum2.a += dotPacked4x8EXT(a2, b3);
+            sum3.r += dotPacked4x8EXT(a3, b0);
+            sum3.g += dotPacked4x8EXT(a3, b1);
+            sum3.b += dotPacked4x8EXT(a3, b2);
+            sum3.a += dotPacked4x8EXT(a3, b3);
+#else
+            const ivec4 a0 = unpackInt4x8(tmp_a[ly][kk + 0]);
+            const ivec4 a1 = unpackInt4x8(tmp_a[ly][kk + 1]);
+            const ivec4 a2 = unpackInt4x8(tmp_a[ly][kk + 2]);
+            const ivec4 a3 = unpackInt4x8(tmp_a[ly][kk + 3]);
+
+            const ivec4 b0 = unpackInt4x8(tmp_b[lx][kk + 0]);
+            const ivec4 b1 = unpackInt4x8(tmp_b[lx][kk + 1]);
+            const ivec4 b2 = unpackInt4x8(tmp_b[lx][kk + 2]);
+            const ivec4 b3 = unpackInt4x8(tmp_b[lx][kk + 3]);
+
+            sum0.r += a0.r * b0.r + a0.g * b0.g + a0.b * b0.b + a0.a * b0.a;
+            sum0.g += a0.r * b1.r + a0.g * b1.g + a0.b * b1.b + a0.a * b1.a;
+            sum0.b += a0.r * b2.r + a0.g * b2.g + a0.b * b2.b + a0.a * b2.a;
+            sum0.a += a0.r * b3.r + a0.g * b3.g + a0.b * b3.b + a0.a * b3.a;
+            sum1.r += a1.r * b0.r + a1.g * b0.g + a1.b * b0.b + a1.a * b0.a;
+            sum1.g += a1.r * b1.r + a1.g * b1.g + a1.b * b1.b + a1.a * b1.a;
+            sum1.b += a1.r * b2.r + a1.g * b2.g + a1.b * b2.b + a1.a * b2.a;
+            sum1.a += a1.r * b3.r + a1.g * b3.g + a1.b * b3.b + a1.a * b3.a;
+            sum2.r += a2.r * b0.r + a2.g * b0.g + a2.b * b0.b + a2.a * b0.a;
+            sum2.g += a2.r * b1.r + a2.g * b1.g + a2.b * b1.b + a2.a * b1.a;
+            sum2.b += a2.r * b2.r + a2.g * b2.g + a2.b * b2.b + a2.a * b2.a;
+            sum2.a += a2.r * b3.r + a2.g * b3.g + a2.b * b3.b + a2.a * b3.a;
+            sum3.r += a3.r * b0.r + a3.g * b0.g + a3.b * b0.b + a3.a * b0.a;
+            sum3.g += a3.r * b1.r + a3.g * b1.g + a3.b * b1.b + a3.a * b1.a;
+            sum3.b += a3.r * b2.r + a3.g * b2.g + a3.b * b2.b + a3.a * b2.a;
+            sum3.a += a3.r * b3.r + a3.g * b3.g + a3.b * b3.b + a3.a * b3.a;
+#endif
+        }
+
+        barrier();
+    }
+
+    if (gx * 4 >= p.N || gy * 4 >= p.M)
+        return;
+#else
+    for (int k = 0; k < p.K; k += 4)
+    {
+        const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3);
+        const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3);
+
+        ivec4 a0 = ivec4(0);
+        ivec4 a1 = ivec4(0);
+        ivec4 a2 = ivec4(0);
+        ivec4 a3 = ivec4(0);
+
+        if (gy4.r < p.M)
+        {
+            const int ai = int(gy4.r) * p.K + k;
+            a0.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a0.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a0.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a0.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.g < p.M)
+        {
+            const int ai = int(gy4.g) * p.K + k;
+            a1.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a1.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a1.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a1.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.b < p.M)
+        {
+            const int ai = int(gy4.b) * p.K + k;
+            a2.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a2.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a2.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a2.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+        if (gy4.a < p.M)
+        {
+            const int ai = int(gy4.a) * p.K + k;
+            a3.r = i8buffer_ld1(A_int8_data, ai + 0);
+            if (k + 1 < p.K) a3.g = i8buffer_ld1(A_int8_data, ai + 1);
+            if (k + 2 < p.K) a3.b = i8buffer_ld1(A_int8_data, ai + 2);
+            if (k + 3 < p.K) a3.a = i8buffer_ld1(A_int8_data, ai + 3);
+        }
+
+        ivec4 b0 = ivec4(0);
+        ivec4 b1 = ivec4(0);
+        ivec4 b2 = ivec4(0);
+        ivec4 b3 = ivec4(0);
+
+        if (gx4.r < p.N)
+        {
+            const int bi = int(gx4.r) * p.K + k;
+            b0.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b0.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b0.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b0.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.g < p.N)
+        {
+            const int bi = int(gx4.g) * p.K + k;
+            b1.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b1.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b1.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b1.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.b < p.N)
+        {
+            const int bi = int(gx4.b) * p.K + k;
+            b2.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b2.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b2.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b2.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+        if (gx4.a < p.N)
+        {
+            const int bi = int(gx4.a) * p.K + k;
+            b3.r = i8buffer_ld1(B_int8_data, bi + 0);
+            if (k + 1 < p.K) b3.g = i8buffer_ld1(B_int8_data, bi + 1);
+            if (k + 2 < p.K) b3.b = i8buffer_ld1(B_int8_data, bi + 2);
+            if (k + 3 < p.K) b3.a = i8buffer_ld1(B_int8_data, bi + 3);
+        }
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        const int a0_packed = packInt4x8(a0);
+        const int a1_packed = packInt4x8(a1);
+        const int a2_packed = packInt4x8(a2);
+        const int a3_packed = packInt4x8(a3);
+
+        const int b0_packed = packInt4x8(b0);
+        const int b1_packed = packInt4x8(b1);
+        const int b2_packed = packInt4x8(b2);
+        const int b3_packed = packInt4x8(b3);
+
+        sum0.r += dotPacked4x8EXT(a0_packed, b0_packed);
+        sum0.g += dotPacked4x8EXT(a0_packed, b1_packed);
+        sum0.b += dotPacked4x8EXT(a0_packed, b2_packed);
+        sum0.a += dotPacked4x8EXT(a0_packed, b3_packed);
+        sum1.r += dotPacked4x8EXT(a1_packed, b0_packed);
+        sum1.g += dotPacked4x8EXT(a1_packed, b1_packed);
+        sum1.b += dotPacked4x8EXT(a1_packed, b2_packed);
+        sum1.a += dotPacked4x8EXT(a1_packed, b3_packed);
+        sum2.r += dotPacked4x8EXT(a2_packed, b0_packed);
+        sum2.g += dotPacked4x8EXT(a2_packed, b1_packed);
+        sum2.b += dotPacked4x8EXT(a2_packed, b2_packed);
+        sum2.a += dotPacked4x8EXT(a2_packed, b3_packed);
+        sum3.r += dotPacked4x8EXT(a3_packed, b0_packed);
+        sum3.g += dotPacked4x8EXT(a3_packed, b1_packed);
+        sum3.b += dotPacked4x8EXT(a3_packed, b2_packed);
+        sum3.a += dotPacked4x8EXT(a3_packed, b3_packed);
+#else
+        sum0.r += a0.r * b0.r + a0.g * b0.g + a0.b * b0.b + a0.a * b0.a;
+        sum0.g += a0.r * b1.r + a0.g * b1.g + a0.b * b1.b + a0.a * b1.a;
+        sum0.b += a0.r * b2.r + a0.g * b2.g + a0.b * b2.b + a0.a * b2.a;
+        sum0.a += a0.r * b3.r + a0.g * b3.g + a0.b * b3.b + a0.a * b3.a;
+        sum1.r += a1.r * b0.r + a1.g * b0.g + a1.b * b0.b + a1.a * b0.a;
+        sum1.g += a1.r * b1.r + a1.g * b1.g + a1.b * b1.b + a1.a * b1.a;
+        sum1.b += a1.r * b2.r + a1.g * b2.g + a1.b * b2.b + a1.a * b2.a;
+        sum1.a += a1.r * b3.r + a1.g * b3.g + a1.b * b3.b + a1.a * b3.a;
+        sum2.r += a2.r * b0.r + a2.g * b0.g + a2.b * b0.b + a2.a * b0.a;
+        sum2.g += a2.r * b1.r + a2.g * b1.g + a2.b * b1.b + a2.a * b1.a;
+        sum2.b += a2.r * b2.r + a2.g * b2.g + a2.b * b2.b + a2.a * b2.a;
+        sum2.a += a2.r * b3.r + a2.g * b3.g + a2.b * b3.b + a2.a * b3.a;
+        sum3.r += a3.r * b0.r + a3.g * b0.g + a3.b * b0.b + a3.a * b0.a;
+        sum3.g += a3.r * b1.r + a3.g * b1.g + a3.b * b1.b + a3.a * b1.a;
+        sum3.b += a3.r * b2.r + a3.g * b2.g + a3.b * b2.b + a3.a * b2.a;
+        sum3.a += a3.r * b3.r + a3.g * b3.g + a3.b * b3.b + a3.a * b3.a;
+#endif
+    }
+#endif
+
+    const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3);
+    const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3);
+
+    const float B_descale = B_descale_data[0];
+
+    float descale0 = 0.f;
+    float descale1 = 0.f;
+    float descale2 = 0.f;
+    float descale3 = 0.f;
+
+    if (gy4.r < p.M)
+    {
+        descale0 = A_descales_data[gy4.r] * B_descale;
+    }
+    if (gy4.g < p.M)
+    {
+        descale1 = A_descales_data[gy4.g] * B_descale;
+    }
+    if (gy4.b < p.M)
+    {
+        descale2 = A_descales_data[gy4.b] * B_descale;
+    }
+    if (gy4.a < p.M)
+    {
+        descale3 = A_descales_data[gy4.a] * B_descale;
+    }
+
+    vec4 sumfp0 = vec4(sum0) * descale0;
+    vec4 sumfp1 = vec4(sum1) * descale1;
+    vec4 sumfp2 = vec4(sum2) * descale2;
+    vec4 sumfp3 = vec4(sum3) * descale3;
+
+    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;
+    if (broadcast_type_C != -1)
+    {
+        if (broadcast_type_C == 0)
+        {
+            const float c = float(buffer_ld1(C_blob_data, 0)) * beta;
+            sumfp0 += c;
+            sumfp1 += c;
+            sumfp2 += c;
+            sumfp3 += c;
+        }
+        if (broadcast_type_C == 1 || broadcast_type_C == 2)
+        {
+            if (gy4.r < p.M) sumfp0 += float(buffer_ld1(C_blob_data, gy4.r)) * beta;
+            if (gy4.g < p.M) sumfp1 += float(buffer_ld1(C_blob_data, gy4.g)) * beta;
+            if (gy4.b < p.M) sumfp2 += float(buffer_ld1(C_blob_data, gy4.b)) * beta;
+            if (gy4.a < p.M) sumfp3 += float(buffer_ld1(C_blob_data, gy4.a)) * beta;
+        }
+        if (broadcast_type_C == 3)
+        {
+            if (gy4.r < p.M)
+            {
+                const uint ci = gy4.r * uint(p.N) + gx * 4;
+                sumfp0.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp0.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp0.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp0.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.g < p.M)
+            {
+                const uint ci = gy4.g * uint(p.N) + gx * 4;
+                sumfp1.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp1.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp1.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp1.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.b < p.M)
+            {
+                const uint ci = gy4.b * uint(p.N) + gx * 4;
+                sumfp2.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp2.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp2.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp2.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+            if (gy4.a < p.M)
+            {
+                const uint ci = gy4.a * uint(p.N) + gx * 4;
+                sumfp3.r += float(buffer_ld1(C_blob_data, ci + 0)) * beta;
+                if (gx4.g < p.N) sumfp3.g += float(buffer_ld1(C_blob_data, ci + 1)) * beta;
+                if (gx4.b < p.N) sumfp3.b += float(buffer_ld1(C_blob_data, ci + 2)) * beta;
+                if (gx4.a < p.N) sumfp3.a += float(buffer_ld1(C_blob_data, ci + 3)) * beta;
+            }
+        }
+        if (broadcast_type_C == 4)
+        {
+            vec4 c = vec4(0.f);
+            c.r = float(buffer_ld1(C_blob_data, gx4.r));
+            if (gx4.g < p.N) c.g = float(buffer_ld1(C_blob_data, gx4.g));
+            if (gx4.b < p.N) c.b = float(buffer_ld1(C_blob_data, gx4.b));
+            if (gx4.a < p.N) c.a = float(buffer_ld1(C_blob_data, gx4.a));
+
+            sumfp0 += c * beta;
+            sumfp1 += c * beta;
+            sumfp2 += c * beta;
+            sumfp3 += c * beta;
+        }
+    }
+
+    if (alpha != 1.f)
+    {
+        sumfp0 *= alpha;
+        sumfp1 *= alpha;
+        sumfp2 *= alpha;
+        sumfp3 *= alpha;
+    }
+
+    if (output_transpose == 1)
+    {
+        const uvec4 gi4 = gx4 * uint(p.outhstep) + gy * 4;
+
+        buffer_st1(top_blob_data, gi4.r, sumfp0.r);
+        if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.r + 1, sumfp1.r);
+        if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.r + 2, sumfp2.r);
+        if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.r + 3, sumfp3.r);
+        if (gx4.g < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.g, sumfp0.g);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.g + 1, sumfp1.g);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.g + 2, sumfp2.g);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.g + 3, sumfp3.g);
+        }
+        if (gx4.b < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.b, sumfp0.b);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.b + 1, sumfp1.b);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.b + 2, sumfp2.b);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.b + 3, sumfp3.b);
+        }
+        if (gx4.a < p.N)
+        {
+            buffer_st1(top_blob_data, gi4.a, sumfp0.a);
+            if (gy4.g < p.M) buffer_st1(top_blob_data, gi4.a + 1, sumfp1.a);
+            if (gy4.b < p.M) buffer_st1(top_blob_data, gi4.a + 2, sumfp2.a);
+            if (gy4.a < p.M) buffer_st1(top_blob_data, gi4.a + 3, sumfp3.a);
+        }
+    }
+    else
+    {
+        const uvec4 gi4 = gy4 * uint(p.outhstep) + gx * 4;
+
+        buffer_st1(top_blob_data, gi4.r, sumfp0.r);
+        if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.r + 1, sumfp0.g);
+        if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.r + 2, sumfp0.b);
+        if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.r + 3, sumfp0.a);
+        if (gy4.g < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.g, sumfp1.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.g + 1, sumfp1.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.g + 2, sumfp1.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.g + 3, sumfp1.a);
+        }
+        if (gy4.b < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.b, sumfp2.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.b + 1, sumfp2.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.b + 2, sumfp2.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.b + 3, sumfp2.a);
+        }
+        if (gy4.a < p.M)
+        {
+            buffer_st1(top_blob_data, gi4.a, sumfp3.r);
+            if (gx4.g < p.N) buffer_st1(top_blob_data, gi4.a + 1, sumfp3.g);
+            if (gx4.b < p.N) buffer_st1(top_blob_data, gi4.a + 2, sumfp3.b);
+            if (gx4.a < p.N) buffer_st1(top_blob_data, gi4.a + 3, sumfp3.a);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/gemm_int8_cm.comp b/src/layer/vulkan/shader/gemm_int8_cm.comp
new file mode 100644
index 000000000000..26b0fd4a9275
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_int8_cm.comp
@@ -0,0 +1,1087 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_basic : require
+
+#extension GL_KHR_memory_scope_semantics : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#if ncnn_VK_KHR_cooperative_matrix
+#extension GL_KHR_cooperative_matrix : require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix : require
+#extension GL_NV_integer_cooperative_matrix : require
+#endif
+
+layout(constant_id = 0) const float alpha = 1.f; // NOTE(review): presumably the scale applied to the result - confirm in the epilogue
+layout(constant_id = 1) const float beta = 1.f; // NOTE(review): presumably the scale applied to C - confirm in the epilogue
+layout(constant_id = 2) const int constantA = 0; // 1: main() reads A at a_offset + siq (offset derived from workgroup / subgroup index); otherwise at gm * GK + gk with bounds checks
+layout(constant_id = 3) const int constantB = 0; // 1: main() reads B at b_offset + siq (offset derived from workgroup / subgroup index); otherwise at gn * GK + gk with bounds checks
+layout(constant_id = 4) const int constantC = 0; // NOTE(review): presumably 1 selects constant_broadcast_type_C over p.broadcast_type_C - confirm
+layout(constant_id = 5) const int constant_broadcast_type_C = 0; // NOTE(review): presumably the compile-time counterpart of p.broadcast_type_C - confirm
+layout(constant_id = 6) const int output_transpose = 0; // NOTE(review): presumably 1 writes the result transposed - confirm in the epilogue
+layout(constant_id = 7) const uint GM = 0; // read through psc(GM); upper bound for the global row index gm
+layout(constant_id = 8) const uint GN = 0; // read through psc(GN); upper bound for the global column index gn
+layout(constant_id = 9) const uint GK = 0; // read through psc(GK); upper bound for the reduction index gk
+layout(constant_id = 10) const uint out_elempack = 0; // NOTE(review): presumably the output packing, mirrored by p.out_elempack - confirm
+
+layout(constant_id = 11 + 0) const uint M = 1; // cooperative-matrix tile rows: A is M x K, accumulator is M x N
+layout(constant_id = 11 + 1) const uint N = 1; // cooperative-matrix tile columns: B is K x N, accumulator is M x N
+layout(constant_id = 11 + 2) const uint K = 1; // cooperative-matrix reduction depth
+layout(constant_id = 11 + 3) const uint subgroup_size = 32; // used in place of gl_SubgroupSize, which main() notes is not a constant
+layout(constant_id = 11 + 4) const uint UNROLL_SG_M = 2; // accumulator tiles per subgroup along M (sum[UNROLL_SG_N][UNROLL_SG_M])
+layout(constant_id = 11 + 5) const uint UNROLL_SG_N = 2; // accumulator tiles per subgroup along N (sum[UNROLL_SG_N][UNROLL_SG_M])
+layout(constant_id = 11 + 6) const uint UNROLL_SG_K = 2; // K tiles consumed per main-loop iteration (k += UNROLL_SG_K)
+layout(constant_id = 11 + 7) const uint UNROLL_WG_M = 2; // subgroups per workgroup along M (local_size = subgroup_size * UNROLL_WG_M * UNROLL_WG_N)
+layout(constant_id = 11 + 8) const uint UNROLL_WG_N = 2; // subgroups per workgroup along N (sgmi = sgi / UNROLL_WG_N, sgni = sgi % UNROLL_WG_N)
+
+layout(binding = 0) writeonly buffer top_blob { sfp top_blob_data[]; }; // output, scalar-typed view
+layout(binding = 1) readonly buffer A_int8_blob { sint8vec4 A_int8_data[]; }; // int8 A; each element holds 4 int8 values, so main() indexes it with (byte index) / 4
+layout(binding = 2) readonly buffer B_int8_blob { sint8vec4 B_int8_data[]; }; // int8 B; each element holds 4 int8 values, so main() indexes it with (byte index) / 4
+layout(binding = 3) readonly buffer C_blob { sfp C_blob_data[]; }; // NOTE(review): presumably the optional C operand - confirm
+layout(binding = 4) readonly buffer A_descales_blob { float A_descales_data[]; }; // NOTE(review): presumably dequantization scales for A - confirm indexing in the epilogue
+layout(binding = 5) readonly buffer B_descale_blob { float B_descale_data[]; }; // NOTE(review): presumably the dequantization scale for B - confirm
+layout(binding = 6) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; // output, 4-wide view; NOTE(review): presumably used for packed stores - confirm
+
+layout(push_constant) uniform parameter // runtime parameters; GM / GN / GK share names with the specialization constants and are read through psc()
+{
+    uint GM; // NOTE(review): presumably used by psc(GM) when the specialization constant is left unset - confirm
+    uint GN; // NOTE(review): same for psc(GN)
+    uint GK; // NOTE(review): same for psc(GK)
+    int broadcast_type_C; // NOTE(review): presumably the runtime broadcast mode of C - confirm
+    uint outhstep; // NOTE(review): presumably the output row stride in elements - confirm
+    uint out_elempack; // NOTE(review): presumably the runtime output packing - confirm
+} p;
+
+// NOTE(review): presumably explains why tmp_o below is a separate array instead of aliasing tmp_a / tmp_b:
+// a cooperative-matrix store may happen while another subgroup is still loading - confirm
+const uint Md4 = M / 4;
+const uint Nd4 = N / 4; // ints per B-tile row: main() packs four int8 values along N into one int (packInt4x8)
+const uint Kd4 = K / 4; // ints per A-tile row: main() packs four int8 values along K into one int (packInt4x8)
+
+// PAD: extra ints appended to each shared-memory tile row to avoid bank conflicts
+#if ncnn_VK_KHR_cooperative_matrix
+#define PAD 1
+#elif ncnn_VK_NV_cooperative_matrix
+// FIXME: padding causes incorrect results on old drivers, so it is disabled for the NV path
+#define PAD 0
+#endif
+
+const uint Nd4p = Nd4 + PAD; // padded row stride of tmp_b in ints (passed as the stride when loading B)
+const uint Kd4p = Kd4 + PAD; // padded row stride of tmp_a in ints (passed as the stride when loading A)
+
+shared int tmp_a[UNROLL_WG_M][UNROLL_SG_K * UNROLL_SG_M * M * Kd4p]; // staged A tiles, first index is sgmi
+shared int tmp_b[UNROLL_WG_N][UNROLL_SG_K * UNROLL_SG_N * K * Nd4p]; // staged B tiles, first index is sgni
+shared ivec4 tmp_o[UNROLL_WG_N * UNROLL_WG_M][UNROLL_SG_N * UNROLL_SG_M * M * Nd4]; // NOTE(review): presumably per-subgroup staging for the accumulator tiles - confirm
+
+void main()
+{
+    // neither gl_SubgroupSize nor gl_WorkGroupSize.x is a constant
+    const uint local_size = subgroup_size * UNROLL_WG_M * UNROLL_WG_N;
+
+    // [ WG_UN * WG_UM * [ SG_UN * SG_UM * subgroup ] ]
+
+    //                     <----WG_UN---->
+    //       +---N--+-SG_UN+------+------+
+    //       |      |      |      |XXXXXX|
+    //       M             |       XXXX<----coopmat<M,N>
+    //       |      |      |      |XXXXXX|
+    //       +-- --SG0-- --+-- --SG2-- --+
+    //       |      |      |      |      |
+    //      SG_UM          |             |
+    //       |      |      |      |      |
+    //    ^  +------+--WORKGROUP--+------+
+    //    |  |      |      |      |      |
+    //    |  |             |             |
+    //    |  |      |      |      |      |
+    //  WG_UM+-- --SG1-- --+-- --SG3-- --+
+    //    |  |      |      |      |      |
+    //    |  |             |             |
+    //    |  |      |      |      |      |
+    //    v  +------+------+------+------+
+    //
+
+    const uint wgi = gl_WorkGroupID.x;
+    const uint sgi = gl_SubgroupID;
+
+    const uint wgmm = (psc(GM) + M * UNROLL_SG_M * UNROLL_WG_M - 1) / (M * UNROLL_SG_M * UNROLL_WG_M);
+    const uint wgnn = (psc(GN) + N * UNROLL_SG_N * UNROLL_WG_N - 1) / (N * UNROLL_SG_N * UNROLL_WG_N);
+
+    const uint wgmi = wgi / wgnn;
+    const uint wgni = wgi % wgnn;
+
+    const uint sgmi = sgi / UNROLL_WG_N;
+    const uint sgni = sgi % UNROLL_WG_N;
+
+    const uint kk = (psc(GK) + K - 1) / K;
+
+    if (wgmi >= wgmm)
+        return;
+
+    const uint si = gl_SubgroupInvocationID;
+
+    const uint ni = (wgni * UNROLL_WG_N + sgni) * UNROLL_SG_N;
+    const uint mi = (wgmi * UNROLL_WG_M + sgmi) * UNROLL_SG_M;
+
+#if ncnn_VK_KHR_cooperative_matrix
+    coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator> sum[UNROLL_SG_N][UNROLL_SG_M];
+#elif ncnn_VK_NV_cooperative_matrix
+    icoopmatNV<32, gl_ScopeSubgroup, M, N> sum[UNROLL_SG_N][UNROLL_SG_M];
+#endif
+
+    {
+        [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                sum[zn][zm] = coopmat<int, gl_ScopeSubgroup, M, N, gl_MatrixUseAccumulator>(0);
+#elif ncnn_VK_NV_cooperative_matrix
+                sum[zn][zm] = icoopmatNV<32, gl_ScopeSubgroup, M, N>(0);
+#endif
+            }
+        }
+    }
+
+    uint k = 0;
+
+    if (kk >= UNROLL_SG_K * 2)
+    {
+        // local stack and shared memory ping-pong
+
+        // prefetch
+        int prefetch_tmp_a[(UNROLL_SG_M * UNROLL_SG_K * M * Kd4 + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N)];
+        int prefetch_tmp_b[(UNROLL_SG_N * UNROLL_SG_K * K * Nd4 + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M)];
+
+        // prefetch the very first
+        {
+            const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M;
+            const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K;
+            const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+            [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK)
+                {
+                    int v = 0;
+
+                    if (constantA == 1)
+                    {
+                        const uint kk_full = kk / UNROLL_SG_K;
+
+                        if (0 < kk_full)
+                        {
+                            const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + sgmi * M_Kd4_USGM_USGK;
+                            v = i8buffer_sm4(A_int8_data, a_offset + siq);
+                        }
+                        else
+                        {
+                            const uint zk = siq / M_Kd4_USGM;
+                            const uint sij = siq % M_Kd4_USGM;
+
+                            if (zk < kk)
+                            {
+                                const uint tail_offset = (wgmi * kk * UNROLL_WG_M + sgmi) * M_Kd4_USGM;
+                                v = i8buffer_sm4(A_int8_data, tail_offset + sij);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        const uint zk = siq / M_Kd4_USGM;
+                        const uint zmij = siq % M_Kd4_USGM;
+                        const uint zm = zmij / (M * Kd4);
+                        const uint ij = zmij % (M * Kd4);
+                        const uint i = ij / Kd4;
+                        const uint j = ij % Kd4;
+
+                        const uint gm = (mi + zm) * M + i;
+                        const uint gk = zk * K + j * 4;
+
+                        if (gm < psc(GM) && gk < psc(GK))
+                        {
+                            if (psc(GK) % 4 == 0)
+                            {
+                                v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4);
+                            }
+                            else
+                            {
+                                const uint ai = gm * psc(GK) + gk;
+                                const uint aim4 = ai % 4;
+                                const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4);
+                                ivec4 v1 = ivec4(0);
+                                if (gk + 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1);
+                                ivec4 v4;
+                                if (aim4 == 0)
+                                {
+                                    v4 = v0;
+                                }
+                                else if (aim4 == 1)
+                                {
+                                    v4 = ivec4(v0.g, v0.b, v0.a, v1.r);
+                                }
+                                else if (aim4 == 2)
+                                {
+                                    v4 = ivec4(v0.b, v0.a, v1.r, v1.g);
+                                }
+                                else
+                                {
+                                    v4 = ivec4(v0.a, v1.r, v1.g, v1.b);
+                                }
+                                if (gk + 1 >= psc(GK)) v4.g = 0;
+                                if (gk + 2 >= psc(GK)) v4.b = 0;
+                                if (gk + 3 >= psc(GK)) v4.a = 0;
+                                v = packInt4x8(v4);
+                            }
+                        }
+                    }
+
+                    prefetch_tmp_a[q] = v;
+                }
+            }
+        }
+        {
+            const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N;
+            const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K;
+            const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK)
+                {
+                    int v = 0;
+
+                    if (constantB == 1)
+                    {
+                        const uint kk_full = kk / UNROLL_SG_K;
+
+                        if (0 < kk_full)
+                        {
+                            const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + sgni * K_Nd4_USGN_USGK;
+                            v = i8buffer_sm4(B_int8_data, b_offset + siq);
+                        }
+                        else
+                        {
+                            const uint zk = siq / K_Nd4_USGN;
+                            const uint sij = siq % K_Nd4_USGN;
+
+                            if (zk < kk)
+                            {
+                                const uint tail_offset = (wgni * kk * UNROLL_WG_N + sgni) * K_Nd4_USGN;
+                                v = i8buffer_sm4(B_int8_data, tail_offset + sij);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        const uint zk = siq / K_Nd4_USGN;
+                        const uint znij = siq % K_Nd4_USGN;
+                        const uint zn = znij / (K * Nd4);
+                        const uint ij = znij % (K * Nd4);
+                        const uint i = ij / Nd4;
+                        const uint j = ij % Nd4;
+
+                        const uint gk = zk * K + i;
+                        const uint gn = (ni + zn) * N + j * 4;
+
+                        if (gn < psc(GN) && gk < psc(GK))
+                        {
+                            ivec4 v4 = ivec4(0);
+                            uint bi = gn * psc(GK) + gk;
+                            v4.r = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                            if (gn + 1 < psc(GN))
+                            {
+                                bi += psc(GK);
+                                v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                            }
+                            if (gn + 2 < psc(GN))
+                            {
+                                bi += psc(GK);
+                                v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                            }
+                            if (gn + 3 < psc(GN))
+                            {
+                                bi += psc(GK);
+                                v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                            }
+                            v = packInt4x8(v4);
+                        }
+                    }
+
+                    prefetch_tmp_b[q] = v;
+                }
+            }
+        }
+
+        for (; k + UNROLL_SG_K < kk; k += UNROLL_SG_K)
+        {
+            // copy prefetched tile to shared memory
+            {
+                const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M;
+                const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K;
+                const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        const uint zk = siq / M_Kd4_USGM;
+                        const uint zmij = siq % M_Kd4_USGM;
+                        const uint zm = zmij / (M * Kd4);
+                        const uint ij = zmij % (M * Kd4);
+                        const uint i = ij / Kd4;
+                        const uint j = ij % Kd4;
+
+                        tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = prefetch_tmp_a[q];
+#elif ncnn_VK_NV_cooperative_matrix
+                        tmp_a[sgmi][siq] = prefetch_tmp_a[q];
+#endif
+                    }
+                }
+            }
+            {
+                const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N;
+                const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K;
+                const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        const uint zk = siq / K_Nd4_USGN;
+                        const uint znij = siq % K_Nd4_USGN;
+                        const uint zn = znij / (K * Nd4);
+                        const uint ij = znij % (K * Nd4);
+                        const uint i = ij / Nd4;
+                        const uint j = ij % Nd4;
+
+                        tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = prefetch_tmp_b[q];
+#elif ncnn_VK_NV_cooperative_matrix
+                        tmp_b[sgni][siq] = prefetch_tmp_b[q];
+#endif
+                    }
+                }
+            }
+
+            barrier();
+
+            // prefetch next tile
+            const uint ki = k + UNROLL_SG_K;
+            {
+                const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M;
+                const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K;
+                const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK)
+                    {
+                        int v = 0;
+
+                        if (constantA == 1)
+                        {
+                            const uint kk_full = kk / UNROLL_SG_K;
+
+                            if (ki / UNROLL_SG_K < kk_full)
+                            {
+                                const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + ((ki / UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM_USGK;
+                                v = i8buffer_sm4(A_int8_data, a_offset + siq);
+                            }
+                            else
+                            {
+                                const uint zk = siq / M_Kd4_USGM;
+                                const uint sij = siq % M_Kd4_USGM;
+
+                                if (ki + zk < kk)
+                                {
+                                    const uint tail_offset = (wgmi * kk * UNROLL_WG_M + kk_full * UNROLL_WG_M * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM;
+                                    v = i8buffer_sm4(A_int8_data, tail_offset + sij);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            const uint zk = siq / M_Kd4_USGM;
+                            const uint zmij = siq % M_Kd4_USGM;
+                            const uint zm = zmij / (M * Kd4);
+                            const uint ij = zmij % (M * Kd4);
+                            const uint i = ij / Kd4;
+                            const uint j = ij % Kd4;
+
+                            const uint gm = (mi + zm) * M + i;
+                            const uint gk = (ki + zk) * K + j * 4;
+
+                            if (gm < psc(GM) && gk < psc(GK))
+                            {
+                                if (psc(GK) % 4 == 0)
+                                {
+                                    v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4);
+                                }
+                                else
+                                {
+                                    const uint ai = gm * psc(GK) + gk;
+                                    const uint aim4 = ai % 4;
+                                    const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4);
+                                    ivec4 v1 = ivec4(0);
+                                    if (gk + 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1);
+                                    ivec4 v4;
+                                    if (aim4 == 0)
+                                    {
+                                        v4 = v0;
+                                    }
+                                    else if (aim4 == 1)
+                                    {
+                                        v4 = ivec4(v0.g, v0.b, v0.a, v1.r);
+                                    }
+                                    else if (aim4 == 2)
+                                    {
+                                        v4 = ivec4(v0.b, v0.a, v1.r, v1.g);
+                                    }
+                                    else
+                                    {
+                                        v4 = ivec4(v0.a, v1.r, v1.g, v1.b);
+                                    }
+                                    if (gk + 1 >= psc(GK)) v4.g = 0;
+                                    if (gk + 2 >= psc(GK)) v4.b = 0;
+                                    if (gk + 3 >= psc(GK)) v4.a = 0;
+                                    v = packInt4x8(v4);
+                                }
+                            }
+                        }
+
+                        prefetch_tmp_a[q] = v;
+                    }
+                }
+            }
+            {
+                const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N;
+                const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K;
+                const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK)
+                    {
+                        int v = 0;
+
+                        if (constantB == 1)
+                        {
+                            const uint kk_full = kk / UNROLL_SG_K;
+
+                            if (ki / UNROLL_SG_K < kk_full)
+                            {
+                                const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN_USGK;
+                                v = i8buffer_sm4(B_int8_data, b_offset + siq);
+                            }
+                            else
+                            {
+                                const uint zk = siq / K_Nd4_USGN;
+                                const uint sij = siq % K_Nd4_USGN;
+
+                                if (ki + zk < kk)
+                                {
+                                    const uint tail_offset = (wgni * kk * UNROLL_WG_N + kk_full * UNROLL_WG_N * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN;
+                                    v = i8buffer_sm4(B_int8_data, tail_offset + sij);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            const uint zk = siq / K_Nd4_USGN;
+                            const uint znij = siq % K_Nd4_USGN;
+                            const uint zn = znij / (K * Nd4);
+                            const uint ij = znij % (K * Nd4);
+                            const uint i = ij / Nd4;
+                            const uint j = ij % Nd4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gn = (ni + zn) * N + j * 4;
+
+                            if (gn < psc(GN) && gk < psc(GK))
+                            {
+                                ivec4 v4 = ivec4(0);
+                                uint bi = gn * psc(GK) + gk;
+                                v4.r = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                if (gn + 1 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                if (gn + 2 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                if (gn + 3 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                v = packInt4x8(v4);
+                            }
+                        }
+
+                        prefetch_tmp_b[q] = v;
+                    }
+                }
+            }
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+
+        // copy and compute the last prefetched tile
+        {
+            const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M;
+            const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K;
+            const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+            [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    const uint zk = siq / M_Kd4_USGM;
+                    const uint zmij = siq % M_Kd4_USGM;
+                    const uint zm = zmij / (M * Kd4);
+                    const uint ij = zmij % (M * Kd4);
+                    const uint i = ij / Kd4;
+                    const uint j = ij % Kd4;
+
+                    tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = prefetch_tmp_a[q];
+#elif ncnn_VK_NV_cooperative_matrix
+                    tmp_a[sgmi][siq] = prefetch_tmp_a[q];
+#endif
+                }
+            }
+        }
+        {
+            const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N;
+            const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K;
+            const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+            [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++)
+            {
+                const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    const uint zk = siq / K_Nd4_USGN;
+                    const uint znij = siq % K_Nd4_USGN;
+                    const uint zn = znij / (K * Nd4);
+                    const uint ij = znij % (K * Nd4);
+                    const uint i = ij / Nd4;
+                    const uint j = ij % Nd4;
+
+                    tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = prefetch_tmp_b[q];
+#elif ncnn_VK_NV_cooperative_matrix
+                    tmp_b[sgni][siq] = prefetch_tmp_b[q];
+#endif
+                }
+            }
+        }
+
+        barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+        coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+        coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+        icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+        icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+        [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+        {
+            [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+            }
+
+            [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                    sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                }
+            }
+        }
+
+        barrier();
+    }
+    else
+    {
+        for (uint ki = 0; ki < kk; ki += UNROLL_SG_K)
+        {
+            {
+                const uint M_Kd4_USGM = M * Kd4 * UNROLL_SG_M;
+                const uint M_Kd4_USGM_USGK = M_Kd4_USGM * UNROLL_SG_K;
+                const uint M_Kd4_USGM_USGK_d_subgroupsize = (M_Kd4_USGM_USGK + (subgroup_size * UNROLL_WG_N - 1)) / (subgroup_size * UNROLL_WG_N);
+                [[unroll]] for (uint q = 0; q < M_Kd4_USGM_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_N + sgni) * subgroup_size + si;
+
+                    if (M_Kd4_USGM_USGK % (subgroup_size * UNROLL_WG_N) == 0 || siq < M_Kd4_USGM_USGK)
+                    {
+                        int v = 0;
+
+                        if (constantA == 1)
+                        {
+                            const uint kk_full = kk / UNROLL_SG_K;
+
+                            if (ki / UNROLL_SG_K < kk_full)
+                            {
+                                const uint a_offset = (wgmi * kk * UNROLL_WG_M) * M_Kd4_USGM + ((ki / UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM_USGK;
+                                v = i8buffer_sm4(A_int8_data, a_offset + siq);
+                            }
+                            else
+                            {
+                                const uint zk = siq / M_Kd4_USGM;
+                                const uint sij = siq % M_Kd4_USGM;
+
+                                if (ki + zk < kk)
+                                {
+                                    const uint tail_offset = (wgmi * kk * UNROLL_WG_M + kk_full * UNROLL_WG_M * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_M + sgmi) * M_Kd4_USGM;
+                                    v = i8buffer_sm4(A_int8_data, tail_offset + sij);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            const uint zk = siq / M_Kd4_USGM;
+                            const uint zmij = siq % M_Kd4_USGM;
+                            const uint zm = zmij / (M * Kd4);
+                            const uint ij = zmij % (M * Kd4);
+                            const uint i = ij / Kd4;
+                            const uint j = ij % Kd4;
+
+                            const uint gm = (mi + zm) * M + i;
+                            const uint gk = (ki + zk) * K + j * 4;
+
+                            if (gm < psc(GM) && gk < psc(GK))
+                            {
+                                if (psc(GK) % 4 == 0)
+                                {
+                                    v = i8buffer_sm4(A_int8_data, (gm * psc(GK) + gk) / 4);
+                                }
+                                else
+                                {
+                                    const uint ai = gm * psc(GK) + gk;
+                                    const uint aim4 = ai % 4;
+                                    const ivec4 v0 = i8buffer_ld4(A_int8_data, ai / 4);
+                                    ivec4 v1 = ivec4(0);
+                                    if (gk + 4 - aim4 < psc(GK)) v1 = i8buffer_ld4(A_int8_data, ai / 4 + 1);
+                                    ivec4 v4;
+                                    if (aim4 == 0)
+                                    {
+                                        v4 = v0;
+                                    }
+                                    else if (aim4 == 1)
+                                    {
+                                        v4 = ivec4(v0.g, v0.b, v0.a, v1.r);
+                                    }
+                                    else if (aim4 == 2)
+                                    {
+                                        v4 = ivec4(v0.b, v0.a, v1.r, v1.g);
+                                    }
+                                    else
+                                    {
+                                        v4 = ivec4(v0.a, v1.r, v1.g, v1.b);
+                                    }
+                                    if (gk + 1 >= psc(GK)) v4.g = 0;
+                                    if (gk + 2 >= psc(GK)) v4.b = 0;
+                                    if (gk + 3 >= psc(GK)) v4.a = 0;
+                                    v = packInt4x8(v4);
+                                }
+                            }
+                        }
+
+#if ncnn_VK_KHR_cooperative_matrix
+                        const uint zk = siq / M_Kd4_USGM;
+                        const uint zmij = siq % M_Kd4_USGM;
+                        const uint zm = zmij / (M * Kd4);
+                        const uint ij = zmij % (M * Kd4);
+                        const uint i = ij / Kd4;
+                        const uint j = ij % Kd4;
+
+                        tmp_a[sgmi][((zk * UNROLL_SG_M + zm) * M + i) * Kd4p + j] = v;
+#elif ncnn_VK_NV_cooperative_matrix
+                        tmp_a[sgmi][siq] = v;
+#endif
+                    }
+                }
+            }
+            {
+                const uint K_Nd4_USGN = K * Nd4 * UNROLL_SG_N;
+                const uint K_Nd4_USGN_USGK = K_Nd4_USGN * UNROLL_SG_K;
+                const uint K_Nd4_USGN_USGK_d_subgroupsize = (K_Nd4_USGN_USGK + (subgroup_size * UNROLL_WG_M - 1)) / (subgroup_size * UNROLL_WG_M);
+                [[unroll]] for (uint q = 0; q < K_Nd4_USGN_USGK_d_subgroupsize; q++)
+                {
+                    const uint siq = (q * UNROLL_WG_M + sgmi) * subgroup_size + si;
+
+                    if (K_Nd4_USGN_USGK % (subgroup_size * UNROLL_WG_M) == 0 || siq < K_Nd4_USGN_USGK)
+                    {
+                        int v = 0;
+
+                        if (constantB == 1)
+                        {
+                            const uint kk_full = kk / UNROLL_SG_K;
+
+                            if (ki / UNROLL_SG_K < kk_full)
+                            {
+                                const uint b_offset = (wgni * kk * UNROLL_WG_N) * K_Nd4_USGN + ((ki / UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN_USGK;
+                                v = i8buffer_sm4(B_int8_data, b_offset + siq);
+                            }
+                            else
+                            {
+                                const uint zk = siq / K_Nd4_USGN;
+                                const uint sij = siq % K_Nd4_USGN;
+
+                                if (ki + zk < kk)
+                                {
+                                    const uint tail_offset = (wgni * kk * UNROLL_WG_N + kk_full * UNROLL_WG_N * UNROLL_SG_K + (ki + zk - kk_full * UNROLL_SG_K) * UNROLL_WG_N + sgni) * K_Nd4_USGN;
+                                    v = i8buffer_sm4(B_int8_data, tail_offset + sij);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            const uint zk = siq / K_Nd4_USGN;
+                            const uint znij = siq % K_Nd4_USGN;
+                            const uint zn = znij / (K * Nd4);
+                            const uint ij = znij % (K * Nd4);
+                            const uint i = ij / Nd4;
+                            const uint j = ij % Nd4;
+
+                            const uint gk = (ki + zk) * K + i;
+                            const uint gn = (ni + zn) * N + j * 4;
+
+                            if (gn < psc(GN) && gk < psc(GK))
+                            {
+                                ivec4 v4 = ivec4(0);
+                                uint bi = gn * psc(GK) + gk;
+                                v4.r = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                if (gn + 1 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.g = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                if (gn + 2 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.b = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                if (gn + 3 < psc(GN))
+                                {
+                                    bi += psc(GK);
+                                    v4.a = i8buffer_ld4(B_int8_data, bi / 4)[bi % 4];
+                                }
+                                v = packInt4x8(v4);
+                            }
+                        }
+
+#if ncnn_VK_KHR_cooperative_matrix
+                        const uint zk = siq / K_Nd4_USGN;
+                        const uint znij = siq % K_Nd4_USGN;
+                        const uint zn = znij / (K * Nd4);
+                        const uint ij = znij % (K * Nd4);
+                        const uint i = ij / Nd4;
+                        const uint j = ij % Nd4;
+
+                        tmp_b[sgni][((zk * UNROLL_SG_N + zn) * K + i) * Nd4p + j] = v;
+#elif ncnn_VK_NV_cooperative_matrix
+                        tmp_b[sgni][siq] = v;
+#endif
+                    }
+                }
+            }
+
+            barrier();
+
+#if ncnn_VK_KHR_cooperative_matrix
+            coopmat<int8_t, gl_ScopeSubgroup, M, K, gl_MatrixUseA> A[UNROLL_SG_M];
+            coopmat<int8_t, gl_ScopeSubgroup, K, N, gl_MatrixUseB> B[UNROLL_SG_N];
+#elif ncnn_VK_NV_cooperative_matrix
+            icoopmatNV<8, gl_ScopeSubgroup, M, K> A[UNROLL_SG_M];
+            icoopmatNV<8, gl_ScopeSubgroup, K, N> B[UNROLL_SG_N];
+#endif
+
+            [[unroll]] for (uint zk = 0; zk < UNROLL_SG_K; zk++)
+            {
+                [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(A[zm], tmp_a[sgmi], (zk * UNROLL_SG_M + zm) * (M * Kd4p), Kd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+#if ncnn_VK_KHR_cooperative_matrix
+                    coopMatLoad(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                    coopMatLoadNV(B[zn], tmp_b[sgni], (zk * UNROLL_SG_N + zn) * (K * Nd4p), Nd4p, false);
+#endif
+                }
+
+                [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+                {
+                    [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+                    {
+#if ncnn_VK_KHR_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAdd(A[zm], B[zn], sum[zn][zm]);
+#elif ncnn_VK_NV_cooperative_matrix
+                        sum[zn][zm] = coopMatMulAddNV(A[zm], B[zn], sum[zn][zm]);
+#endif
+                    }
+                }
+            }
+
+            barrier();
+        }
+    }
+
+    [[unroll]] for (uint zn = 0; zn < UNROLL_SG_N; zn++)
+    {
+        [[unroll]] for (uint zm = 0; zm < UNROLL_SG_M; zm++)
+        {
+            if (output_transpose == 0 && psc(out_elempack) == 4)
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, gl_CooperativeMatrixLayoutColumnMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (Md4 * N), Md4, true);
+#endif
+            }
+            else
+            {
+#if ncnn_VK_KHR_cooperative_matrix
+                coopMatStore(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+                coopMatStoreNV(sum[zn][zm], tmp_o[sgi], (zn * UNROLL_SG_M + zm) * (M * Nd4), Nd4, false);
+#endif
+            }
+        }
+    }
+
+    barrier();
+
+    const int broadcast_type_C = constantC == 1 ? constant_broadcast_type_C : p.broadcast_type_C;
+
+    if (output_transpose == 0 && psc(out_elempack) == 4)
+    {
+        const uint Md4_N_USGM_USGN = Md4 * N * UNROLL_SG_M * UNROLL_SG_N;
+        const uint Md4_N_USGM_USGN_d_subgroupsize = (Md4_N_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < Md4_N_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (Md4_N_USGM_USGN % subgroup_size == 0 || siq < Md4_N_USGM_USGN)
+            {
+                const uint zn = siq / (Md4 * N * UNROLL_SG_M);
+                const uint zmij = siq % (Md4 * N * UNROLL_SG_M);
+                const uint zm = zmij / (Md4 * N);
+                const uint ij = zmij % (Md4 * N);
+                const uint i = ij / Md4;
+                const uint j = ij % Md4;
+
+                const uint gn = (ni + zn) * N + i;
+                const uint gm = (mi + zm) * Md4 + j;
+
+                if (gm * 4 < psc(GM) && gn < psc(GN))
+                {
+                    ivec4 sumi = tmp_o[sgi][siq];
+                    const uvec4 gm4 = gm * 4 + uvec4(0, 1, 2, 3);
+
+                    vec4 descale = vec4(0.f);
+                    descale.r = A_descales_data[gm4.r];
+                    if (gm4.g < psc(GM)) descale.g = A_descales_data[gm4.g];
+                    if (gm4.b < psc(GM)) descale.b = A_descales_data[gm4.b];
+                    if (gm4.a < psc(GM)) descale.a = A_descales_data[gm4.a];
+
+                    vec4 sumfp = vec4(sumi) * descale * B_descale_data[0];
+
+                    if (broadcast_type_C == 0)
+                    {
+                        sumfp += vec4(float(buffer_ld1(C_blob_data, 0)) * beta);
+                    }
+                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
+                    {
+                        sumfp.r += float(buffer_ld1(C_blob_data, gm4.r)) * beta;
+                        if (gm4.g < psc(GM)) sumfp.g += float(buffer_ld1(C_blob_data, gm4.g)) * beta;
+                        if (gm4.b < psc(GM)) sumfp.b += float(buffer_ld1(C_blob_data, gm4.b)) * beta;
+                        if (gm4.a < psc(GM)) sumfp.a += float(buffer_ld1(C_blob_data, gm4.a)) * beta;
+                    }
+                    if (broadcast_type_C == 3)
+                    {
+                        sumfp.r += float(buffer_ld1(C_blob_data, gm4.r * psc(GN) + gn)) * beta;
+                        if (gm4.g < psc(GM)) sumfp.g += float(buffer_ld1(C_blob_data, gm4.g * psc(GN) + gn)) * beta;
+                        if (gm4.b < psc(GM)) sumfp.b += float(buffer_ld1(C_blob_data, gm4.b * psc(GN) + gn)) * beta;
+                        if (gm4.a < psc(GM)) sumfp.a += float(buffer_ld1(C_blob_data, gm4.a * psc(GN) + gn)) * beta;
+                    }
+                    if (broadcast_type_C == 4)
+                    {
+                        sumfp += vec4(float(buffer_ld1(C_blob_data, gn)) * beta);
+                    }
+
+                    if (alpha != 1.f)
+                    {
+                        sumfp *= alpha;
+                    }
+
+                    buffer_st4(top_blob_data_4, gm * p.outhstep + gn, afpvec4(sumfp));
+                }
+            }
+        }
+    }
+    else
+    {
+        const uint M_Nd4_USGM_USGN = M * Nd4 * UNROLL_SG_M * UNROLL_SG_N;
+        const uint M_Nd4_USGM_USGN_d_subgroupsize = (M_Nd4_USGM_USGN + subgroup_size - 1) / subgroup_size;
+        [[unroll]] for (uint q = 0; q < M_Nd4_USGM_USGN_d_subgroupsize; q++)
+        {
+            const uint siq = si + q * subgroup_size;
+
+            if (M_Nd4_USGM_USGN % subgroup_size == 0 || siq < M_Nd4_USGM_USGN)
+            {
+                const uint zn = siq / (M * Nd4 * UNROLL_SG_M);
+                const uint zmij = siq % (M * Nd4 * UNROLL_SG_M);
+                const uint zm = zmij / (M * Nd4);
+                const uint ij = zmij % (M * Nd4);
+                const uint i = ij / Nd4;
+                const uint j = ij % Nd4;
+
+                const uint gm = (mi + zm) * M + i;
+                const uint gn = (ni + zn) * N + j * 4;
+
+                if (gm < psc(GM) && gn < psc(GN))
+                {
+                    ivec4 sumi = tmp_o[sgi][siq];
+                    vec4 sumfp = vec4(sumi) * A_descales_data[gm] * B_descale_data[0];
+
+                    if (broadcast_type_C == 0)
+                    {
+                        sumfp += vec4(float(buffer_ld1(C_blob_data, 0)) * beta);
+                    }
+                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
+                    {
+                        sumfp += vec4(float(buffer_ld1(C_blob_data, gm)) * beta);
+                    }
+                    if (broadcast_type_C == 3)
+                    {
+                        sumfp.r += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn)) * beta;
+                        if (gn + 1 < psc(GN)) sumfp.g += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 1)) * beta;
+                        if (gn + 2 < psc(GN)) sumfp.b += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 2)) * beta;
+                        if (gn + 3 < psc(GN)) sumfp.a += float(buffer_ld1(C_blob_data, gm * psc(GN) + gn + 3)) * beta;
+                    }
+                    if (broadcast_type_C == 4)
+                    {
+                        sumfp.r += float(buffer_ld1(C_blob_data, gn)) * beta;
+                        if (gn + 1 < psc(GN)) sumfp.g += float(buffer_ld1(C_blob_data, gn + 1)) * beta;
+                        if (gn + 2 < psc(GN)) sumfp.b += float(buffer_ld1(C_blob_data, gn + 2)) * beta;
+                        if (gn + 3 < psc(GN)) sumfp.a += float(buffer_ld1(C_blob_data, gn + 3)) * beta;
+                    }
+
+                    if (alpha != 1.f)
+                    {
+                        sumfp *= alpha;
+                    }
+
+                    if (output_transpose == 1)
+                    {
+                        if (psc(out_elempack) == 4)
+                        {
+                            buffer_st4(top_blob_data_4, (gn / 4) * p.outhstep + gm, afpvec4(sumfp));
+                        }
+                        else
+                        {
+                            buffer_st1(top_blob_data, gn * p.outhstep + gm, sumfp.r);
+                            if (gn + 1 < psc(GN)) buffer_st1(top_blob_data, (gn + 1) * p.outhstep + gm, sumfp.g);
+                            if (gn + 2 < psc(GN)) buffer_st1(top_blob_data, (gn + 2) * p.outhstep + gm, sumfp.b);
+                            if (gn + 3 < psc(GN)) buffer_st1(top_blob_data, (gn + 3) * p.outhstep + gm, sumfp.a);
+                        }
+                    }
+                    else
+                    {
+                        buffer_st1(top_blob_data, gm * p.outhstep + gn, sumfp.r);
+                        if (gn + 1 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 1, sumfp.g);
+                        if (gn + 2 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 2, sumfp.b);
+                        if (gn + 3 < psc(GN)) buffer_st1(top_blob_data, gm * p.outhstep + gn + 3, sumfp.a);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/gemm_quantize_A_int8.comp b/src/layer/vulkan/shader/gemm_quantize_A_int8.comp
new file mode 100644
index 000000000000..ab5c97518a3f
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize_A_int8.comp
@@ -0,0 +1,46 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int transA = 0; // 1: main() reads the element for (row, k) at k * A_hstep + row; otherwise at row * A_hstep + k
+
+layout(binding = 0) readonly buffer A_blob { sfp A_blob_data[]; }; // source matrix, read through buffer_ld1
+layout(binding = 1) writeonly buffer A_int8_blob { sint8 A_int8_data[]; }; // quantized output, written densely at row * K + k
+layout(binding = 2) writeonly buffer A_descales_blob { float A_descales_data[]; }; // one value per row: absmax / 127, or 0 for an all-zero row
+
+layout(push_constant) uniform parameter
+{
+    int M; // row count; invocations whose global x index is >= M exit immediately
+    int K; // number of elements scanned and quantized per row
+    int A_dims; // not read by main() in this shader
+    int A_hstep; // stride applied when indexing A_blob_data (see transA)
+} p;
+
+void main()
+{
+    // One invocation per row: find the row's absolute maximum, then rescale the row so that maximum maps to 127.
+    const int row = int(gl_GlobalInvocationID.x);
+    if (row >= p.M)
+        return;
+
+    // The source element for (row, col) sits at src_base + col * src_step; transA == 1 swaps which index is strided.
+    const int src_base = transA == 1 ? row : row * p.A_hstep;
+    const int src_step = transA == 1 ? p.A_hstep : 1;
+
+    float row_absmax = 0.f;
+    for (int col = 0; col < p.K; col++)
+        row_absmax = max(row_absmax, abs(float(buffer_ld1(A_blob_data, src_base + col * src_step))));
+
+    // An all-zero row quantizes with a unit scale and publishes a zero descale.
+    const bool all_zero = row_absmax == 0.f;
+    const float quant_scale = all_zero ? 1.f : 127.f / row_absmax;
+    A_descales_data[row] = all_zero ? 0.f : row_absmax * (1.f / 127.f);
+
+    for (int col = 0; col < p.K; col++)
+    {
+        const float scaled = float(buffer_ld1(A_blob_data, src_base + col * src_step)) * quant_scale;
+        const int quantized = float2int8(scaled);
+        i8buffer_st1(A_int8_data, row * p.K + col, quantized);
+    }
+}
diff --git a/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp
new file mode 100644
index 000000000000..13a77bffe7e6
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize_B_absmax_int8.comp
@@ -0,0 +1,53 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int transB = 0; // with n = e / K and k = e - n * K: 1 reads at n * B_hstep + k, otherwise at k * B_hstep + n
+
+layout(binding = 0) readonly buffer B_blob { sfp B_blob_data[]; }; // source matrix, read through buffer_ld1
+layout(binding = 1) writeonly buffer B_absmax_blob { float B_absmax_data[]; }; // one partial absmax per workgroup, indexed by gl_WorkGroupID.x
+
+layout(push_constant) uniform parameter
+{
+    int N; // not read by main() in this shader
+    int K; // divisor that splits the flat element index into its two coordinates
+    int B_dims; // not read by main() in this shader
+    int B_hstep; // stride applied when indexing B_blob_data (see transB)
+    int size; // exclusive upper bound of the flat element index
+} p;
+
+shared float absmax_shared[128]; // reduction scratch indexed by gl_LocalInvocationID.x; main() assumes 128 slots (strides of 128, first half-width 64)
+
+void main()
+{
+    // Each workgroup reduces one 1024-element chunk of the flat index range [0, p.size) to a single absolute maximum.
+    const int lane = int(gl_LocalInvocationID.x);
+    const int group = int(gl_WorkGroupID.x);
+    const int chunk_begin = group * 1024;
+    const int chunk_end = min(chunk_begin + 1024, p.size);
+
+    // Strided pass: this invocation visits chunk elements lane, lane + 128, ... and keeps its own running maximum.
+    float partial = 0.f;
+    for (int e = chunk_begin + lane; e < chunk_end; e += 128)
+    {
+        const int n = e / p.K;
+        const int k = e - n * p.K;
+        const int src = transB == 1 ? n * p.B_hstep + k : k * p.B_hstep + n;
+        partial = max(partial, abs(float(buffer_ld1(B_blob_data, src))));
+    }
+
+    absmax_shared[lane] = partial;
+    barrier();
+
+    // Halving reduction over the shared slots; the barrier stays outside the branch so every invocation reaches it.
+    int half_width = 64;
+    while (half_width > 0)
+    {
+        if (lane < half_width)
+            absmax_shared[lane] = max(absmax_shared[lane], absmax_shared[lane + half_width]);
+        barrier();
+        half_width >>= 1;
+    }
+    if (lane == 0) B_absmax_data[group] = absmax_shared[0];
+}
diff --git a/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp
new file mode 100644
index 000000000000..64e51ed76c66
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize_B_descale_int8.comp
@@ -0,0 +1,44 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(binding = 0) readonly buffer B_absmax_blob { float B_absmax_data[]; }; // partial maxima to fold; main() reads entries [0, blocks)
+layout(binding = 1) writeonly buffer B_descale_blob { float B_descale_data[]; }; // only element 0 is written: absmax / 127, or 0 when absmax is 0
+
+layout(push_constant) uniform parameter
+{
+    int blocks; // number of B_absmax_data entries to reduce
+} p;
+
+shared float absmax_shared[128]; // reduction scratch indexed by gl_LocalInvocationID.x; main() assumes 128 slots
+
+void main()
+{
+    // Folds the partial maxima into one descale; presumably dispatched as a single workgroup (gl_WorkGroupID is never read) - confirm at the call site.
+    const int lane = int(gl_LocalInvocationID.x);
+
+    // Strided pass: this invocation covers entries lane, lane + 128, ... below p.blocks.
+    float partial = 0.f;
+    for (int b = lane; b < p.blocks; b += 128)
+        partial = max(partial, B_absmax_data[b]);
+
+    absmax_shared[lane] = partial;
+    barrier();
+
+    // Halving reduction in shared memory; the barrier sits outside the branch so all invocations reach it.
+    int half_width = 64;
+    while (half_width > 0)
+    {
+        if (lane < half_width)
+            absmax_shared[lane] = max(absmax_shared[lane], absmax_shared[lane + half_width]);
+        barrier();
+        half_width >>= 1;
+    }
+
+    if (lane != 0)
+        return;
+
+    const float total = absmax_shared[0];
+    B_descale_data[0] = total == 0.f ? 0.f : total * (1.f / 127.f);
+}
diff --git a/src/layer/vulkan/shader/gemm_quantize_B_int8.comp b/src/layer/vulkan/shader/gemm_quantize_B_int8.comp
new file mode 100644
index 000000000000..704a5ca340f0
--- /dev/null
+++ b/src/layer/vulkan/shader/gemm_quantize_B_int8.comp
@@ -0,0 +1,57 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int transB = 0; // with n = e / K and k = e - n * K: 1 reads at n * B_hstep + k, otherwise at k * B_hstep + n
+
+layout(binding = 0) readonly buffer B_blob { sfp B_blob_data[]; }; // source matrix, read through buffer_ld1
+layout(binding = 1) writeonly buffer B_int8_blob { sint8 B_int8_data[]; }; // scalar view, used only for a partial final group of four
+layout(binding = 2) readonly buffer B_descale_blob { float B_descale_data[]; }; // element 0 holds the descale; main() quantizes with its reciprocal
+layout(binding = 3) writeonly buffer B_int8_blob_pack4 { sint8vec4 B_int8_data_pack4[]; }; // four-at-a-time view for full groups; presumably the same buffer as binding 1 - confirm at the dispatch site
+
+layout(push_constant) uniform parameter
+{
+    int N; // not read by main() in this shader
+    int K; // divisor that splits the flat element index into its two coordinates
+    int B_dims; // not read by main() in this shader
+    int B_hstep; // stride applied when indexing B_blob_data (see transB)
+    int size; // exclusive upper bound of the flat element index
+} p;
+
+void main()
+{
+    // Each invocation quantizes up to four consecutive flat elements, starting at four times its global x index.
+    const int quad = int(gl_GlobalInvocationID.x);
+    const int first = quad * 4;
+    if (first >= p.size)
+        return;
+
+    // The quantization scale is the reciprocal of the stored descale; a zero descale falls back to a unit scale.
+    const float descale = B_descale_data[0];
+    const float scale = descale == 0.f ? 1.f : 1.f / descale;
+
+    // Lanes at or beyond p.size are never computed and keep their zero initializer.
+    ivec4 q4 = ivec4(0);
+    for (int lane = 0; lane < 4 && first + lane < p.size; lane++)
+    {
+        const int e = first + lane;
+        const int n = e / p.K;
+        const int k = e - n * p.K;
+        const int src = transB == 1 ? n * p.B_hstep + k : k * p.B_hstep + n;
+        const float scaled = float(buffer_ld1(B_blob_data, src)) * scale;
+        q4[lane] = float2int8(scaled);
+    }
+
+    // A complete group of four goes out as one packed store through the pack4 view.
+    if (first + 3 < p.size)
+    {
+        i8buffer_st4(B_int8_data_pack4, quad, q4);
+        return;
+    }
+
+    // Partial final group: element `first` is always in range here; the next two are stored only if they exist.
+    i8buffer_st1(B_int8_data, first, q4.r);
+    if (first + 1 < p.size) i8buffer_st1(B_int8_data, first + 1, q4.g);
+    if (first + 2 < p.size) i8buffer_st1(B_int8_data, first + 2, q4.b);
+}
diff --git a/src/layer/vulkan/shader/innerproduct_gemm_int8.comp b/src/layer/vulkan/shader/innerproduct_gemm_int8.comp
new file mode 100644
index 000000000000..90cc11673e6e
--- /dev/null
+++ b/src/layer/vulkan/shader/innerproduct_gemm_int8.comp
@@ -0,0 +1,415 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+#define LOCAL_MEMORY_UNROLL_INCH 8
+
+layout(constant_id = 0) const int bias_term = 0; // bias_data is added in main() only when this is 1
+layout(constant_id = 1) const int activation_type = 0; // activation_type and the two params below are passed to activation_afpvec4
+layout(constant_id = 2) const float activation_param_0 = 0;
+layout(constant_id = 3) const float activation_param_1 = 0;
+layout(constant_id = 4) const int input_elempack = 1; // 0 means main() takes the pack size from psc(c); main() has a path for 4 and treats everything else as 1
+layout(constant_id = 5) const int weight_data_stride = 0; // stride between consecutive gx rows of weight_data: main() indexes (gx * weight_data_stride + wi) * 4 + n
+
+#define shape_constant_id_offset 6
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob_1 { sint8 bottom_blob_int8_data_1[]; }; // scalar view of the int8 input, read when elempack is not 4
+layout(binding = 1) writeonly buffer top_blob_1 { sfp top_blob_data_1[]; }; // scalar view of the output, written when output_elempack is not 4
+layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // int8 weights, read through i8buffer_sm4
+layout(binding = 3) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; // one vec4 of descale factors per gx, multiplied into the int32 sums
+layout(binding = 4) readonly buffer bias_blob { sfpvec4 bias_data[]; }; // one vec4 of bias per gx, read only when bias_term == 1
+layout(binding = 5) writeonly buffer top_blob_4 { sfpvec4 top_blob_data_4[]; }; // 4-wide view of the output, written when output_elempack == 4
+layout(binding = 6) readonly buffer bottom_blob_4 { sint8vec4 bottom_blob_int8_data_4[]; }; // 4-wide view of the int8 input, read when elempack == 4
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+#if NCNN_shader_local_memory
+// avoid bank conflict
+#define PAD 1
+shared int tmp_v[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; // packed input words staged by main(), indexed [ly][lx]; NOTE(review): the fixed 8 presumably matches an 8x8 local size - confirm in the host code
+shared int tmp_k[8][LOCAL_MEMORY_UNROLL_INCH + PAD]; // packed weight words staged by main(), indexed [lx][ly]
+#endif
+
+void main() // int8 inner product over several input rows: each invocation accumulates a 4x4 tile of int32 sums (rows gy * 4 + 0..3, outputs gx * 4 + 0..3), then descales, adds bias, applies the activation and stores
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    const uint gz = gl_GlobalInvocationID.z;
+
+#if !NCNN_shader_local_memory
+    if (gx * 4 >= psc(outw) || gy * 4 >= psc(outh) || gz >= 1)
+        return;
+#else
+    if (gz >= 1) // the outw / outh bounds check for this path is deferred until after the tile loop below; presumably so every invocation helps fill the shared tiles and reaches barrier()
+        return;
+#endif
+
+    const int elempack = input_elempack == 0 ? psc(c) : input_elempack; // input_elempack == 0 means: take the pack size from psc(c)
+    const int output_elempack = elempack; // output uses the same packing as the input
+
+    ivec4 sum0 = ivec4(0); // sumN accumulates row gy * 4 + N; its 4 components are outputs gx * 4 + 0..3
+    ivec4 sum1 = ivec4(0);
+    ivec4 sum2 = ivec4(0);
+    ivec4 sum3 = ivec4(0);
+
+#if NCNN_shader_local_memory
+    const uint lx = gl_LocalInvocationID.x;
+    const uint ly = gl_LocalInvocationID.y;
+
+    for (int i = 0; i < weight_data_stride * 4; i += LOCAL_MEMORY_UNROLL_INCH) // each iteration consumes LOCAL_MEMORY_UNROLL_INCH input positions
+    {
+        {
+            int v = 0; // stays 0 when the row or the input position is out of range
+            const int vk = i + int(lx / 4) * 4; // lx / 4 selects which group of 4 input positions inside the tile this invocation loads
+            const uint vy = gy * 4 + lx % 4; // lx % 4 selects which of the 4 rows of this gy this invocation loads
+            if (vy < psc(outh) && vk < psc(w))
+            {
+                if (elempack == 4)
+                {
+                    ivec4 v4 = ivec4(0);
+                    const int vy4 = int(vy) / 4; // packed row index
+                    const int vlane = int(vy) % 4; // component of the 4-wide element that belongs to row vy
+                    const int vi = vy4 * psc(cstep) + vk;
+                    v4.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane];
+                    if (vk + 1 < psc(w)) v4.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane];
+                    if (vk + 2 < psc(w)) v4.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane];
+                    if (vk + 3 < psc(w)) v4.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane];
+
+                    v = packInt4x8(v4); // 4 consecutive input positions of one row packed into a single int
+                }
+                else // elempack == 1
+                {
+                    ivec4 v4 = ivec4(0);
+                    const int vi = int(vy) * psc(cstep) + vk;
+                    v4.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0);
+                    if (vk + 1 < psc(w)) v4.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1);
+                    if (vk + 2 < psc(w)) v4.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2);
+                    if (vk + 3 < psc(w)) v4.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3);
+
+                    v = packInt4x8(v4);
+                }
+            }
+
+            tmp_v[ly][lx] = v;
+
+            const int kk = i + int(ly / 4) * 4; // ly / 4 selects the group of 4 input positions, mirroring lx / 4 above
+            const int w_offset = int(gx) * weight_data_stride + kk / 4;
+            const int k = i8buffer_sm4(weight_data, w_offset * 4 + int(ly % 4)); // ly % 4 selects which of the 4 outputs of this gx the weight word belongs to
+
+            tmp_k[lx][ly] = k;
+        }
+
+        barrier(); // both tiles must be completely written before any invocation reads them
+
+        for (int k4 = 0; k4 < LOCAL_MEMORY_UNROLL_INCH / 4; k4++)
+        {
+            const int kk = k4 * 4;
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+            const int v0 = tmp_v[ly][kk + 0]; // tmp_v[ly][kk + n] was stored for row gy * 4 + n at input offset i + kk
+            const int v1 = tmp_v[ly][kk + 1];
+            const int v2 = tmp_v[ly][kk + 2];
+            const int v3 = tmp_v[ly][kk + 3];
+
+            const int k0 = tmp_k[lx][kk + 0]; // tmp_k[lx][kk + n] was stored for output gx * 4 + n at the same input offset
+            const int k1 = tmp_k[lx][kk + 1];
+            const int k2 = tmp_k[lx][kk + 2];
+            const int k3 = tmp_k[lx][kk + 3];
+
+            sum0 += ivec4(dotPacked4x8EXT(v0, k0), dotPacked4x8EXT(v0, k1), dotPacked4x8EXT(v0, k2), dotPacked4x8EXT(v0, k3));
+            sum1 += ivec4(dotPacked4x8EXT(v1, k0), dotPacked4x8EXT(v1, k1), dotPacked4x8EXT(v1, k2), dotPacked4x8EXT(v1, k3));
+            sum2 += ivec4(dotPacked4x8EXT(v2, k0), dotPacked4x8EXT(v2, k1), dotPacked4x8EXT(v2, k2), dotPacked4x8EXT(v2, k3));
+            sum3 += ivec4(dotPacked4x8EXT(v3, k0), dotPacked4x8EXT(v3, k1), dotPacked4x8EXT(v3, k2), dotPacked4x8EXT(v3, k3));
+#else
+            const ivec4 v0 = unpackInt4x8(tmp_v[ly][kk + 0]); // fallback without the dot-product extension: unpack and multiply-add component by component
+            const ivec4 v1 = unpackInt4x8(tmp_v[ly][kk + 1]);
+            const ivec4 v2 = unpackInt4x8(tmp_v[ly][kk + 2]);
+            const ivec4 v3 = unpackInt4x8(tmp_v[ly][kk + 3]);
+
+            const ivec4 k0 = unpackInt4x8(tmp_k[lx][kk + 0]);
+            const ivec4 k1 = unpackInt4x8(tmp_k[lx][kk + 1]);
+            const ivec4 k2 = unpackInt4x8(tmp_k[lx][kk + 2]);
+            const ivec4 k3 = unpackInt4x8(tmp_k[lx][kk + 3]);
+
+            sum0.r += v0.r * k0.r + v0.g * k0.g + v0.b * k0.b + v0.a * k0.a;
+            sum0.g += v0.r * k1.r + v0.g * k1.g + v0.b * k1.b + v0.a * k1.a;
+            sum0.b += v0.r * k2.r + v0.g * k2.g + v0.b * k2.b + v0.a * k2.a;
+            sum0.a += v0.r * k3.r + v0.g * k3.g + v0.b * k3.b + v0.a * k3.a;
+            sum1.r += v1.r * k0.r + v1.g * k0.g + v1.b * k0.b + v1.a * k0.a;
+            sum1.g += v1.r * k1.r + v1.g * k1.g + v1.b * k1.b + v1.a * k1.a;
+            sum1.b += v1.r * k2.r + v1.g * k2.g + v1.b * k2.b + v1.a * k2.a;
+            sum1.a += v1.r * k3.r + v1.g * k3.g + v1.b * k3.b + v1.a * k3.a;
+            sum2.r += v2.r * k0.r + v2.g * k0.g + v2.b * k0.b + v2.a * k0.a;
+            sum2.g += v2.r * k1.r + v2.g * k1.g + v2.b * k1.b + v2.a * k1.a;
+            sum2.b += v2.r * k2.r + v2.g * k2.g + v2.b * k2.b + v2.a * k2.a;
+            sum2.a += v2.r * k3.r + v2.g * k3.g + v2.b * k3.b + v2.a * k3.a;
+            sum3.r += v3.r * k0.r + v3.g * k0.g + v3.b * k0.b + v3.a * k0.a;
+            sum3.g += v3.r * k1.r + v3.g * k1.g + v3.b * k1.b + v3.a * k1.a;
+            sum3.b += v3.r * k2.r + v3.g * k2.g + v3.b * k2.b + v3.a * k2.a;
+            sum3.a += v3.r * k3.r + v3.g * k3.g + v3.b * k3.b + v3.a * k3.a;
+#endif
+        }
+
+        barrier(); // all reads of the tiles must finish before the next iteration overwrites them
+    }
+
+    if (gx * 4 >= psc(outw) || gy * 4 >= psc(outh)) // deferred bounds check of the local-memory path: out-of-range invocations stop here, after the last barrier()
+        return;
+#else
+    for (int i = 0; i < weight_data_stride * 4; i += 4) // one packed word (4 input positions) per iteration
+    {
+        const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3);
+        const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3);
+
+        int v0 = 0; // vN is the packed input word of row gy * 4 + N; it stays 0 when the row or the position is out of range
+        int v1 = 0;
+        int v2 = 0;
+        int v3 = 0;
+
+        if (i < psc(w))
+        {
+            if (elempack == 4)
+            {
+                if (gy4.r < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vy4 = int(gy4.r) / 4; // packed row index
+                    const int vlane = int(gy4.r) % 4; // component of the 4-wide element that belongs to this row
+                    const int vi = vy4 * psc(cstep) + i;
+                    v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane];
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane];
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane];
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane];
+
+                    v0 = packInt4x8(v);
+                }
+                if (gy4.g < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vy4 = int(gy4.g) / 4;
+                    const int vlane = int(gy4.g) % 4;
+                    const int vi = vy4 * psc(cstep) + i;
+                    v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane];
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane];
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane];
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane];
+
+                    v1 = packInt4x8(v);
+                }
+                if (gy4.b < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vy4 = int(gy4.b) / 4;
+                    const int vlane = int(gy4.b) % 4;
+                    const int vi = vy4 * psc(cstep) + i;
+                    v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane];
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane];
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane];
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane];
+
+                    v2 = packInt4x8(v);
+                }
+                if (gy4.a < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vy4 = int(gy4.a) / 4;
+                    const int vlane = int(gy4.a) % 4;
+                    const int vi = vy4 * psc(cstep) + i;
+                    v.r = i8buffer_ld4(bottom_blob_int8_data_4, vi + 0)[vlane];
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld4(bottom_blob_int8_data_4, vi + 1)[vlane];
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld4(bottom_blob_int8_data_4, vi + 2)[vlane];
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld4(bottom_blob_int8_data_4, vi + 3)[vlane];
+
+                    v3 = packInt4x8(v);
+                }
+            }
+            else // elempack == 1
+            {
+                if (gy4.r < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vi = int(gy4.r) * psc(cstep) + i;
+                    v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0);
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1);
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2);
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3);
+
+                    v0 = packInt4x8(v);
+                }
+                if (gy4.g < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vi = int(gy4.g) * psc(cstep) + i;
+                    v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0);
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1);
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2);
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3);
+
+                    v1 = packInt4x8(v);
+                }
+                if (gy4.b < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vi = int(gy4.b) * psc(cstep) + i;
+                    v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0);
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1);
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2);
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3);
+
+                    v2 = packInt4x8(v);
+                }
+                if (gy4.a < psc(outh))
+                {
+                    ivec4 v = ivec4(0);
+                    const int vi = int(gy4.a) * psc(cstep) + i;
+                    v.r = i8buffer_ld1(bottom_blob_int8_data_1, vi + 0);
+                    if (i + 1 < psc(w)) v.g = i8buffer_ld1(bottom_blob_int8_data_1, vi + 1);
+                    if (i + 2 < psc(w)) v.b = i8buffer_ld1(bottom_blob_int8_data_1, vi + 2);
+                    if (i + 3 < psc(w)) v.a = i8buffer_ld1(bottom_blob_int8_data_1, vi + 3);
+
+                    v3 = packInt4x8(v);
+                }
+            }
+        }
+
+        const int wi = i / 4;
+        const int w_offset = int(gx) * weight_data_stride + wi;
+
+        const int k0 = i8buffer_sm4(weight_data, w_offset * 4 + 0); // kN is the packed weight word for output gx * 4 + N at input offset i
+        const int k1 = i8buffer_sm4(weight_data, w_offset * 4 + 1);
+        const int k2 = i8buffer_sm4(weight_data, w_offset * 4 + 2);
+        const int k3 = i8buffer_sm4(weight_data, w_offset * 4 + 3);
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        sum0 += ivec4(dotPacked4x8EXT(v0, k0), dotPacked4x8EXT(v0, k1), dotPacked4x8EXT(v0, k2), dotPacked4x8EXT(v0, k3));
+        sum1 += ivec4(dotPacked4x8EXT(v1, k0), dotPacked4x8EXT(v1, k1), dotPacked4x8EXT(v1, k2), dotPacked4x8EXT(v1, k3));
+        sum2 += ivec4(dotPacked4x8EXT(v2, k0), dotPacked4x8EXT(v2, k1), dotPacked4x8EXT(v2, k2), dotPacked4x8EXT(v2, k3));
+        sum3 += ivec4(dotPacked4x8EXT(v3, k0), dotPacked4x8EXT(v3, k1), dotPacked4x8EXT(v3, k2), dotPacked4x8EXT(v3, k3));
+#else
+        const ivec4 v0v = unpackInt4x8(v0); // fallback without the dot-product extension: unpack and multiply-add component by component
+        const ivec4 v1v = unpackInt4x8(v1);
+        const ivec4 v2v = unpackInt4x8(v2);
+        const ivec4 v3v = unpackInt4x8(v3);
+
+        const ivec4 k0v = unpackInt4x8(k0);
+        const ivec4 k1v = unpackInt4x8(k1);
+        const ivec4 k2v = unpackInt4x8(k2);
+        const ivec4 k3v = unpackInt4x8(k3);
+
+        sum0.r += v0v.r * k0v.r + v0v.g * k0v.g + v0v.b * k0v.b + v0v.a * k0v.a;
+        sum0.g += v0v.r * k1v.r + v0v.g * k1v.g + v0v.b * k1v.b + v0v.a * k1v.a;
+        sum0.b += v0v.r * k2v.r + v0v.g * k2v.g + v0v.b * k2v.b + v0v.a * k2v.a;
+        sum0.a += v0v.r * k3v.r + v0v.g * k3v.g + v0v.b * k3v.b + v0v.a * k3v.a;
+        sum1.r += v1v.r * k0v.r + v1v.g * k0v.g + v1v.b * k0v.b + v1v.a * k0v.a;
+        sum1.g += v1v.r * k1v.r + v1v.g * k1v.g + v1v.b * k1v.b + v1v.a * k1v.a;
+        sum1.b += v1v.r * k2v.r + v1v.g * k2v.g + v1v.b * k2v.b + v1v.a * k2v.a;
+        sum1.a += v1v.r * k3v.r + v1v.g * k3v.g + v1v.b * k3v.b + v1v.a * k3v.a;
+        sum2.r += v2v.r * k0v.r + v2v.g * k0v.g + v2v.b * k0v.b + v2v.a * k0v.a;
+        sum2.g += v2v.r * k1v.r + v2v.g * k1v.g + v2v.b * k1v.b + v2v.a * k1v.a;
+        sum2.b += v2v.r * k2v.r + v2v.g * k2v.g + v2v.b * k2v.b + v2v.a * k2v.a;
+        sum2.a += v2v.r * k3v.r + v2v.g * k3v.g + v2v.b * k3v.b + v2v.a * k3v.a;
+        sum3.r += v3v.r * k0v.r + v3v.g * k0v.g + v3v.b * k0v.b + v3v.a * k0v.a;
+        sum3.g += v3v.r * k1v.r + v3v.g * k1v.g + v3v.b * k1v.b + v3v.a * k1v.a;
+        sum3.b += v3v.r * k2v.r + v3v.g * k2v.g + v3v.b * k2v.b + v3v.a * k2v.a;
+        sum3.a += v3v.r * k3v.r + v3v.g * k3v.g + v3v.b * k3v.b + v3v.a * k3v.a;
+#endif
+    }
+#endif
+
+    const uvec4 gx4 = gx * 4 + uvec4(0, 1, 2, 3);
+    const uvec4 gy4 = gy * 4 + uvec4(0, 1, 2, 3);
+
+    const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx)); // one descale factor per output gx * 4 + 0..3, shared by all four rows
+
+    vec4 sumfp0 = vec4(sum0) * descale; // int32 accumulators converted to float and descaled
+    vec4 sumfp1 = vec4(sum1) * descale;
+    vec4 sumfp2 = vec4(sum2) * descale;
+    vec4 sumfp3 = vec4(sum3) * descale;
+
+    if (bias_term == 1)
+    {
+        vec4 bias = vec4(buffer_ld4(bias_data, gx)); // same bias vector is added to each of the four rows
+
+        sumfp0 += bias;
+        sumfp1 += bias;
+        sumfp2 += bias;
+        sumfp3 += bias;
+    }
+
+    afpvec4 out0 = activation_afpvec4(afpvec4(sumfp0), activation_type, activation_param_0, activation_param_1);
+    afpvec4 out1 = activation_afpvec4(afpvec4(sumfp1), activation_type, activation_param_0, activation_param_1);
+    afpvec4 out2 = activation_afpvec4(afpvec4(sumfp2), activation_type, activation_param_0, activation_param_1);
+    afpvec4 out3 = activation_afpvec4(afpvec4(sumfp3), activation_type, activation_param_0, activation_param_1);
+
+    if (output_elempack == 4)
+    {
+        const uint gi = gy * uint(psc(outw)) + gx * 4; // packed output: element gi + n holds output gx * 4 + n for the four rows of this gy
+
+        buffer_st4(top_blob_data_4, gi + 0, afpvec4(out0.r, out1.r, out2.r, out3.r)); // transposes the 4x4 tile: one component from each row per store
+        if (gx4.g < psc(outw)) buffer_st4(top_blob_data_4, gi + 1, afpvec4(out0.g, out1.g, out2.g, out3.g));
+        if (gx4.b < psc(outw)) buffer_st4(top_blob_data_4, gi + 2, afpvec4(out0.b, out1.b, out2.b, out3.b));
+        if (gx4.a < psc(outw)) buffer_st4(top_blob_data_4, gi + 3, afpvec4(out0.a, out1.a, out2.a, out3.a));
+
+        return;
+    }
+
+    const uvec4 gi4 = gy4 * uint(psc(outw)) + gx * 4; // scalar output: gi4[N] is the first element of row gy * 4 + N for this gx
+
+    buffer_st1(top_blob_data_1, gi4.r, out0.r); // row gy * 4 and output gx * 4 are in range here because of the bounds checks above
+    if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 1, out0.g);
+    if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 2, out0.b);
+    if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.r + 3, out0.a);
+    if (gy4.g < psc(outh))
+    {
+        buffer_st1(top_blob_data_1, gi4.g, out1.r);
+        if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 1, out1.g);
+        if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 2, out1.b);
+        if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.g + 3, out1.a);
+    }
+    if (gy4.b < psc(outh))
+    {
+        buffer_st1(top_blob_data_1, gi4.b, out2.r);
+        if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 1, out2.g);
+        if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 2, out2.b);
+        if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.b + 3, out2.a);
+    }
+    if (gy4.a < psc(outh))
+    {
+        buffer_st1(top_blob_data_1, gi4.a, out3.r);
+        if (gx4.g < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 1, out3.g);
+        if (gx4.b < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 2, out3.b);
+        if (gx4.a < psc(outw)) buffer_st1(top_blob_data_1, gi4.a + 3, out3.a);
+    }
+}
diff --git a/src/layer/vulkan/shader/innerproduct_int8.comp b/src/layer/vulkan/shader/innerproduct_int8.comp
new file mode 100644
index 000000000000..0bbd791c01f7
--- /dev/null
+++ b/src/layer/vulkan/shader/innerproduct_int8.comp
@@ -0,0 +1,87 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // bias_data is added in main() only when this is 1
+layout(constant_id = 1) const int activation_type = 0; // activation_type and the two params below are passed to activation_afpvec4
+layout(constant_id = 2) const float activation_param_0 = 0;
+layout(constant_id = 3) const float activation_param_1 = 0;
+layout(constant_id = 4) const int weight_data_stride = 0; // stride between output groups in weight_data: main() indexes (gx / 4 * weight_data_stride + i / 4) * 4 + n
+
+#define shape_constant_id_offset 5
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int outw = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input, read through i8buffer_sm4 with one index per 4 input positions
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // output, one 4-wide element per invocation
+layout(binding = 2) readonly buffer weight_blob { sint8vec4 weight_data[]; }; // int8 weights, read through i8buffer_sm4
+layout(binding = 3) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; // one vec4 of descale factors per group of 4 outputs
+layout(binding = 4) readonly buffer bias_blob { sfpvec4 bias_data[]; }; // one vec4 of bias per group of 4 outputs, read only when bias_term == 1
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int outw;
+} p;
+
+void main() // int8 inner product for a single input vector: each invocation computes 4 outputs (gx .. gx + 3) and stores them as one 4-wide element
+{
+    const int gx = int(gl_GlobalInvocationID.x) * 4; // index of the first of the 4 outputs handled here
+    const int gy = int(gl_GlobalInvocationID.y);
+    const int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= 1 || gz >= 1)
+        return;
+
+    ivec4 sum = ivec4(0); // int32 accumulators, one component per output
+
+    for (int i = 0; i < psc(w); i += 4) // 4 input positions per iteration
+    {
+        const int v4 = i8buffer_sm4(bottom_blob_int8_data, i / 4); // consumed below as a packed 4x8-bit word (dotPacked4x8EXT / unpackInt4x8)
+        const int wi = i / 4;
+        const int w_offset = gx / 4 * weight_data_stride + wi;
+
+        const int k0 = i8buffer_sm4(weight_data, w_offset * 4 + 0); // kN is the packed weight word for output gx + N at input offset i
+        const int k1 = i8buffer_sm4(weight_data, w_offset * 4 + 1);
+        const int k2 = i8buffer_sm4(weight_data, w_offset * 4 + 2);
+        const int k3 = i8buffer_sm4(weight_data, w_offset * 4 + 3);
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        sum += ivec4(dotPacked4x8EXT(v4, k0), dotPacked4x8EXT(v4, k1), dotPacked4x8EXT(v4, k2), dotPacked4x8EXT(v4, k3));
+#else
+        const ivec4 v0 = unpackInt4x8(v4); // fallback without the dot-product extension: unpack and multiply-add component by component
+        const ivec4 k0v = unpackInt4x8(k0);
+        const ivec4 k1v = unpackInt4x8(k1);
+        const ivec4 k2v = unpackInt4x8(k2);
+        const ivec4 k3v = unpackInt4x8(k3);
+
+        sum += ivec4(v0.r * k0v.r + v0.g * k0v.g + v0.b * k0v.b + v0.a * k0v.a,
+                     v0.r * k1v.r + v0.g * k1v.g + v0.b * k1v.b + v0.a * k1v.a,
+                     v0.r * k2v.r + v0.g * k2v.g + v0.b * k2v.b + v0.a * k2v.a,
+                     v0.r * k3v.r + v0.g * k3v.g + v0.b * k3v.b + v0.a * k3v.a);
+#endif
+    }
+
+    const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx / 4)); // gx / 4 is the 4-wide element index of this invocation
+
+    afpvec4 sumfp32 = afpvec4(vec4(sum) * descale); // int32 sums converted to float and descaled
+
+    if (bias_term == 1)
+    {
+        vec4 bias = vec4(buffer_ld4(bias_data, gx / 4));
+
+        sumfp32 += afpvec4(bias);
+    }
+
+    sumfp32 = activation_afpvec4(sumfp32, activation_type, activation_param_0, activation_param_1);
+
+    buffer_st4(top_blob_data, gx / 4, sumfp32);
+}
diff --git a/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp b/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp
new file mode 100644
index 000000000000..41f70a653503
--- /dev/null
+++ b/src/layer/vulkan/shader/innerproduct_reduce_sum8_int8.comp
@@ -0,0 +1,69 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_GOOGLE_include_directive : enable
+#include "vulkan_activation.comp"
+
+layout(constant_id = 0) const int bias_term = 0; // main() starts from bias_data when this is 1, from zero otherwise
+layout(constant_id = 1) const int activation_type = 0; // activation_type and the two params below are passed to activation_afpvec4
+layout(constant_id = 2) const float activation_param_0 = 0;
+layout(constant_id = 3) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 4
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // number of ivec4 partial sums that main() adds up per output element
+layout(constant_id = shape_constant_id_offset + 1) const int h = 0; // not referenced by main()
+
+layout(constant_id = shape_constant_id_offset + 2) const int outw = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; }; // int32 partial sums, psc(w) consecutive entries per output element
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // final output, one 4-wide element per invocation
+layout(binding = 2) readonly buffer weight_descales_blob { sfpvec4 weight_descales_data[]; }; // one vec4 of descale factors per output element
+layout(binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; // read only when bias_term == 1
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+
+    int outw;
+} p;
+
+void main() // reduction step: adds up psc(w) int32 partial sums for output element gx, descales them, adds bias, applies the activation and stores the result
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= 1 || gz >= 1)
+        return;
+
+    afpvec4 sumfp32; // floating-point result, seeded with the bias (or zero) before the descaled sum is added
+
+    if (bias_term == 1)
+    {
+        sumfp32 = buffer_ld4(bias_data, gx);
+    }
+    else
+    {
+        sumfp32 = afpvec4(0.f);
+    }
+
+    int v_offset = gx * psc(w); // the partial sums of output element gx are stored contiguously starting here
+
+    ivec4 sum = ivec4(0);
+
+    for (int i = 0; i < psc(w); i++)
+    {
+        sum += bottom_blob_data[v_offset + i]; // the reduction itself stays in int32
+    }
+
+    const vec4 descale = vec4(buffer_ld4(weight_descales_data, gx));
+
+    sumfp32 += afpvec4(vec4(sum) * descale); // convert to float and descale only once, after the integer reduction
+
+    sumfp32 = activation_afpvec4(sumfp32, activation_type, activation_param_0, activation_param_1);
+
+    buffer_st4(top_blob_data, gx, sumfp32);
+}
diff --git a/src/layer/vulkan/shader/innerproduct_sum8_int8.comp b/src/layer/vulkan/shader/innerproduct_sum8_int8.comp
new file mode 100644
index 000000000000..d6d84378d68c
--- /dev/null
+++ b/src/layer/vulkan/shader/innerproduct_sum8_int8.comp
@@ -0,0 +1,75 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+#extension GL_EXT_integer_dot_product : require
+#endif
+
+layout(constant_id = 0) const int weight_data_stride = 0; // stride between consecutive gy rows of weight_data, in ivec4 elements
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int w = 0; // input length; main() processes (w + 3) / 4 packed words in total
+
+layout(constant_id = shape_constant_id_offset + 1) const int outw = 0; // bound for gx, i.e. the number of 8-word chunks
+layout(constant_id = shape_constant_id_offset + 2) const int outh = 0; // bound for gy, which selects the weight row
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_int8_data[]; }; // int8 input, read through i8buffer_sm4
+layout(binding = 1) writeonly buffer top_blob { ivec4 top_blob_data[]; }; // int32 partial sums, one ivec4 per (gx, gy)
+layout(binding = 2) readonly buffer weight_blob { ivec4 weight_data[]; }; // each ivec4 component is used as one packed 4x8-bit weight word
+
+layout(push_constant) uniform parameter
+{
+    int w;
+
+    int outw;
+    int outh;
+} p;
+
+void main() // partial-sum step: invocation (gx, gy) accumulates up to 8 packed input words against weight row gy and writes one int32 ivec4 partial sum
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= 1)
+        return;
+
+    const int start = gx * 8; // first packed input word of this chunk
+    const int end = min(gx * 8 + 8, (psc(w) + 3) / 4); // chunk end, clipped to the total number of packed words
+
+    ivec4 sum = ivec4(0);
+
+    int w_offset = gy * weight_data_stride + start; // walks weight row gy in step with i
+
+    for (int i = start; i < end; i++)
+    {
+        const int v4 = i8buffer_sm4(bottom_blob_int8_data, i); // consumed below as a packed 4x8-bit word
+
+        const ivec4 k = weight_data[w_offset]; // four packed weight words, one per output component
+
+#if ncnn_VK_KHR_shader_integer_dot_product && ncnn_shaderIntegerDotProduct && ncnn_integerDotProduct4x8BitPackedSignedAccelerated
+        sum.r += dotPacked4x8EXT(v4, k.x);
+        sum.g += dotPacked4x8EXT(v4, k.y);
+        sum.b += dotPacked4x8EXT(v4, k.z);
+        sum.a += dotPacked4x8EXT(v4, k.w);
+#else
+        const ivec4 v0 = unpackInt4x8(v4); // fallback without the dot-product extension: unpack and multiply-add component by component
+        const ivec4 k0v = unpackInt4x8(k.x);
+        const ivec4 k1v = unpackInt4x8(k.y);
+        const ivec4 k2v = unpackInt4x8(k.z);
+        const ivec4 k3v = unpackInt4x8(k.w);
+
+        sum.r += v0.r * k0v.r + v0.g * k0v.g + v0.b * k0v.b + v0.a * k0v.a;
+        sum.g += v0.r * k1v.r + v0.g * k1v.g + v0.b * k1v.b + v0.a * k1v.a;
+        sum.b += v0.r * k2v.r + v0.g * k2v.g + v0.b * k2v.b + v0.a * k2v.a;
+        sum.a += v0.r * k3v.r + v0.g * k3v.g + v0.b * k3v.b + v0.a * k3v.a;
+#endif
+
+        w_offset += 1;
+    }
+
+    const int gi = gy * psc(outw) + gx; // partial sums of one gy are contiguous over gx
+    top_blob_data[gi] = sum; // written even when the loop ran zero times (sum is then 0)
+}
diff --git a/src/layer/vulkan/shader/packing_pack1to4_int8.comp b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
index 354094ec6e80..4119a2d28832 100644
--- a/src/layer/vulkan/shader/packing_pack1to4_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
@@ -35,12 +35,6 @@ void main()
 
     const uint gi = gy * psc(n) + gx;
 
-    //     if (cast_type_from == cast_type_to)
-    //     {
-    //         i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, gi4);
-    //         return;
-    //     }
-
     ivec4 v;
     if (cast_type_from == 3)
     {
@@ -51,10 +45,18 @@ void main()
     }
     else
     {
-        v.r = i8buffer_ld1(bottom_blob_data, gi4.r);
-        v.g = i8buffer_ld1(bottom_blob_data, gi4.g);
-        v.b = i8buffer_ld1(bottom_blob_data, gi4.b);
-        v.a = i8buffer_ld1(bottom_blob_data, gi4.a);
+        if (cast_type_to == 3)
+        {
+            v.r = i8buffer_ld1(bottom_blob_data, gi4.r);
+            v.g = i8buffer_ld1(bottom_blob_data, gi4.g);
+            v.b = i8buffer_ld1(bottom_blob_data, gi4.b);
+            v.a = i8buffer_ld1(bottom_blob_data, gi4.a);
+        }
+        else
+        {
+            i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, ivec4(gi4));
+            return;
+        }
     }
 
     if (cast_type_to == 3)
diff --git a/src/layer/vulkan/shader/packing_pack4to1_int8.comp b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
index c8b5b523b3e2..dd51546d1aa5 100644
--- a/src/layer/vulkan/shader/packing_pack4to1_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
@@ -35,12 +35,6 @@ void main()
 
     const uvec4 gi4 = (gy * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + gx;
 
-    //     if (cast_type_from == cast_type_to)
-    //     {
-    //         buffer_cp4to1(top_blob_data, gi4, bottom_blob_data, gi);
-    //         return;
-    //     }
-
     ivec4 v;
     if (cast_type_from == 3)
     {
@@ -48,7 +42,15 @@ void main()
     }
     else
     {
-        v = i8buffer_ld4(bottom_blob_data, gi);
+        if (cast_type_to == 3)
+        {
+            v = i8buffer_ld4(bottom_blob_data, gi);
+        }
+        else
+        {
+            i8buffer_cp4to1(top_blob_data, ivec4(gi4), bottom_blob_data, int(gi));
+            return;
+        }
     }
 
     if (cast_type_to == 3)
diff --git a/src/layer/vulkan/shader/padding_3d_int8.comp b/src/layer/vulkan/shader/padding_3d_int8.comp
new file mode 100644
index 000000000000..cd2c31609c67
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_3d_int8.comp
@@ -0,0 +1,116 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode tested in main(): 0 = fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int()
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = type 0 takes its fill value from per_channel_pad_blob
+// Shape constants default to 0 and are only read through psc() (defined outside this file; presumably falls back to p.<name> - confirm)
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input blob shape
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; // multiplied by the channel index when addressing the input
+
+layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; // output blob shape
+layout(constant_id = shape_constant_id_offset + 7) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outd = 0;
+layout(constant_id = shape_constant_id_offset + 10) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; // multiplied by gz when addressing the output
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // input blob
+layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // padded output blob
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; // fill values, indexed by gz in main()
+
+layout(push_constant) uniform parameter // same shape fields as the constants above, plus the pad offsets
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outd;
+    int outc;
+    int outcstep;
+
+    int left; // subtracted from the output x coordinate
+    int top; // subtracted from the output row
+    int front; // subtracted from the output depth index
+} p;
+
+void main() // one invocation writes one element of the padded output blob
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y); // fused index over outh * outd, split into yh / yd below
+    int gz = int(gl_GlobalInvocationID.z);
+    // NOTE(review): psc() and the i8buffer_* helpers are defined outside this file; comments assume cp = copy, st = store - confirm
+    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
+        return; // invocation lies outside the output extent
+
+    const int pad_value = int(value); // float -> int conversion drops the fractional part
+
+    // if (psc(dims) == 4) -- test left disabled; this path runs unconditionally
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left; // source coordinates; values outside the input range mean padding area
+        int yh = gy % psc(outh) - p.top;
+        int yd = gy / psc(outh) - p.front;
+        int y = yd * psc(h) + yh; // fused source row index
+        int z = gz; // no offset is applied on the channel axis
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && yh >= 0 && yh < psc(h) && yd >= 0 && yd < psc(d) && z >= 0 && z < psc(c))
+            {
+                int v_offset = z * psc(cstep) + y * psc(w) + x;
+                i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); // inside the input: take the source element
+            }
+            else if (per_channel_pad == 1)
+            {
+                i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); // fill value selected by output channel
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode: every source coordinate is clamped into the input range
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            yh = clamp(yh, 0, psc(h) - 1);
+            yd = clamp(yd, 0, psc(d) - 1);
+            y = yd * psc(h) + yh; // recompute the fused row from the clamped parts
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode: fold around index 0, then around the last index
+        {
+            x = abs(x);
+            yh = abs(yh);
+            yd = abs(yd);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            yh = (p.h - 1) - abs(yh - (p.h - 1));
+            yd = (p.d - 1) - abs(yd - (p.d - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             yh = (psc(h) - 1) - abs(yh - (psc(h) - 1));
+            //             yd = (psc(d) - 1) - abs(yd - (psc(d) - 1));
+            y = yd * psc(h) + yh; // recompute the fused row from the mirrored parts
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/padding_3d_pack4_int8.comp b/src/layer/vulkan/shader/padding_3d_pack4_int8.comp
new file mode 100644
index 000000000000..149f221f6ad3
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_3d_pack4_int8.comp
@@ -0,0 +1,116 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode tested in main(): 0 = fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int() and splats it to 4 lanes
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = type 0 takes its fill value from per_channel_pad_blob
+// Shape constants default to 0 and are only read through psc() (defined outside this file; presumably falls back to p.<name> - confirm)
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input blob shape
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; // multiplied by the channel index when addressing the input
+
+layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; // output blob shape
+layout(constant_id = shape_constant_id_offset + 7) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outd = 0;
+layout(constant_id = shape_constant_id_offset + 10) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; // multiplied by gz when addressing the output
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; // input blob, 4-wide elements
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // padded output blob, 4-wide elements
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; // fill values, indexed by gz in main()
+
+layout(push_constant) uniform parameter // same shape fields as the constants above, plus the pad offsets
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outd;
+    int outc;
+    int outcstep;
+
+    int left; // subtracted from the output x coordinate
+    int top; // subtracted from the output row
+    int front; // subtracted from the output depth index
+} p;
+
+void main() // one invocation writes one 4-wide element of the padded output blob
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y); // fused index over outh * outd, split into yh / yd below
+    int gz = int(gl_GlobalInvocationID.z);
+    // NOTE(review): psc() and the i8buffer_* helpers are defined outside this file; comments assume cp = copy, st = store - confirm
+    if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc))
+        return; // invocation lies outside the output extent
+
+    const ivec4 pad_value = ivec4(int(value)); // float -> int drops the fractional part; same value in all 4 lanes
+
+    // if (psc(dims) == 4) -- test left disabled; this path runs unconditionally
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left; // source coordinates; values outside the input range mean padding area
+        int yh = gy % psc(outh) - p.top;
+        int yd = gy / psc(outh) - p.front;
+        int y = yd * psc(h) + yh; // fused source row index
+        int z = gz; // no offset is applied on the channel axis
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && yh >= 0 && yh < psc(h) && yd >= 0 && yd < psc(d) && z >= 0 && z < psc(c))
+            {
+                int v_offset = z * psc(cstep) + y * psc(w) + x;
+                i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); // inside the input: take the source element
+            }
+            else if (per_channel_pad == 1)
+            {
+                i8buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); // fill value selected by output channel index
+            }
+            else
+            {
+                i8buffer_st4(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode: every source coordinate is clamped into the input range
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            yh = clamp(yh, 0, psc(h) - 1);
+            yd = clamp(yd, 0, psc(d) - 1);
+            y = yd * psc(h) + yh; // recompute the fused row from the clamped parts
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode: fold around index 0, then around the last index
+        {
+            x = abs(x);
+            yh = abs(yh);
+            yd = abs(yd);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            yh = (p.h - 1) - abs(yh - (p.h - 1));
+            yd = (p.d - 1) - abs(yd - (p.d - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             yh = (psc(h) - 1) - abs(yh - (psc(h) - 1));
+            //             yd = (psc(d) - 1) - abs(yd - (psc(d) - 1));
+            y = yd * psc(h) + yh; // recompute the fused row from the mirrored parts
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/padding_int8.comp b/src/layer/vulkan/shader/padding_int8.comp
new file mode 100644
index 000000000000..ea195035fdaf
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_int8.comp
@@ -0,0 +1,182 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode tested in main(): 0 = fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int()
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = the 3-D type-0 path takes its fill value from per_channel_pad_blob
+// Shape constants default to 0 and are only read through psc() (defined outside this file; presumably falls back to p.<name> - confirm)
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input blob shape; main() branches on dims
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; // multiplied by the channel index when addressing the input
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; // output blob shape
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; // multiplied by gz when addressing the output
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // input blob
+layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // padded output blob
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; // fill values, indexed by gz in main()
+
+layout(push_constant) uniform parameter // same shape fields as the constants above, plus the pad offsets
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int left; // subtracted from the output x coordinate
+    int top; // subtracted from the output row (2-D and 3-D paths)
+    int front; // subtracted from the output channel index (3-D path)
+} p;
+
+void main() // one invocation writes one element of the padded output blob
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+    // NOTE(review): psc() and the i8buffer_* helpers are defined outside this file; comments assume cp = copy, st = store - confirm
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return; // invocation lies outside the output extent
+
+    const int pad_value = int(value); // float -> int conversion drops the fractional part
+
+    if (psc(dims) == 1)
+    {
+        int x = gx - p.left; // source index; outside [0, w) means padding area
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w))
+            {
+                i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x);
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gx, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+
+            i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x);
+        }
+        if (type == 2) // mirror mode: fold around index 0, then around the last index
+        {
+            x = abs(x);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+
+            i8buffer_cp1(top_blob_data, gx, bottom_blob_data, x);
+        }
+    }
+    else if (psc(dims) == 2)
+    {
+        const int gi = gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left; // source coordinates; values outside the input range mean padding area
+        int y = gy - p.top;
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h))
+            {
+                int v_offset = y * psc(w) + x;
+                i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+
+            int v_offset = y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y = abs(y);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+
+            int v_offset = y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+    else // if (psc(dims) == 3)
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left; // source coordinates; values outside the input range mean padding area
+        int y = gy - p.top;
+        int z = gz - p.front; // the channel axis is offset as well in this path
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c))
+            {
+                int v_offset = z * psc(cstep) + y * psc(w) + x;
+                i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+            }
+            else if (per_channel_pad == 1)
+            {
+                i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); // fill value selected by output channel
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+            z = clamp(z, 0, psc(c) - 1);
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y = abs(y);
+            z = abs(z);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            z = (p.c - 1) - abs(z - (p.c - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+            //             z = (psc(c) - 1) - abs(z - (psc(c) - 1));
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/padding_pack1to4_int8.comp b/src/layer/vulkan/shader/padding_pack1to4_int8.comp
new file mode 100644
index 000000000000..e3629d898815
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_pack1to4_int8.comp
@@ -0,0 +1,185 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode tested in main(): 0 = fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int() and splats it to 4 lanes
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = the 3-D type-0 path takes its fill values from per_channel_pad_blob
+// Shape constants default to 0 and are only read through psc() (defined outside this file; presumably falls back to p.<name> - confirm)
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input blob shape; main() branches on dims
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; // multiplied by the channel index when addressing the input
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; // output blob shape
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; // multiplied by gz when addressing the output
+
+layout(binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // input blob, scalar elements
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // padded output blob, 4-wide elements
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; // 4-wide fill values, indexed by gz in main()
+
+layout(push_constant) uniform parameter // same shape fields as the constants above, plus the pad offsets
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int left; // subtracted from the x coordinate
+    int top; // subtracted from the row coordinate (2-D and 3-D paths)
+    int front; // subtracted from the channel coordinate (3-D path)
+} p;
+
+void main() // one invocation gathers four scalar source elements into one 4-wide output element
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+    // NOTE(review): psc() and the i8buffer_* helpers are defined outside this file; comments assume ld = load, st = store, cp1to4 = gather 4 scalars - confirm
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return; // invocation lies outside the output extent
+
+    const ivec4 pad_value = ivec4(int(value)); // float -> int drops the fractional part; same value in all 4 lanes
+
+    if (psc(dims) == 1)
+    {
+        ivec4 x4 = gx * 4 - p.left + ivec4(0, 1, 2, 3); // four consecutive source indices, one per output lane
+
+        if (type == 0) // fill mode
+        {
+            bvec4 mask = bvec4(uvec4(greaterThanEqual(x4, ivec4(0))) & uvec4(lessThan(x4, ivec4(psc(w))))); // true for lanes whose source index is inside the input
+
+            ivec4 v;
+            v.r = mask.r ? i8buffer_ld1(bottom_blob_data, x4.r) : pad_value.r;
+            v.g = mask.g ? i8buffer_ld1(bottom_blob_data, x4.g) : pad_value.g;
+            v.b = mask.b ? i8buffer_ld1(bottom_blob_data, x4.b) : pad_value.b;
+            v.a = mask.a ? i8buffer_ld1(bottom_blob_data, x4.a) : pad_value.a;
+
+            i8buffer_st4(top_blob_data, gx, v);
+        }
+        if (type == 1) // clamp mode
+        {
+            x4 = clamp(x4, 0, psc(w) - 1);
+
+            i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4);
+        }
+        if (type == 2) // mirror mode: fold around index 0, then around the last index
+        {
+            x4 = abs(x4);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x4 = (p.w - 1) - abs(x4 - (p.w - 1));
+            //             x4 = (psc(w) - 1) - abs(x4 - (psc(w) - 1));
+
+            i8buffer_cp1to4(top_blob_data, gx, bottom_blob_data, x4);
+        }
+    }
+    else if (psc(dims) == 2)
+    {
+        const int gi = gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left;
+        ivec4 y4 = gy * 4 - p.top + ivec4(0, 1, 2, 3); // four consecutive source rows, one per output lane
+
+        if (type == 0) // fill mode
+        {
+            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w)) & (uvec4(greaterThanEqual(y4, ivec4(0))) & uvec4(lessThan(y4, ivec4(psc(h)))))); // x test is shared, row test is per lane
+
+            ivec4 v_offset = y4 * psc(w) + x;
+
+            ivec4 v;
+            v.r = mask.r ? i8buffer_ld1(bottom_blob_data, v_offset.r) : pad_value.r;
+            v.g = mask.g ? i8buffer_ld1(bottom_blob_data, v_offset.g) : pad_value.g;
+            v.b = mask.b ? i8buffer_ld1(bottom_blob_data, v_offset.b) : pad_value.b;
+            v.a = mask.a ? i8buffer_ld1(bottom_blob_data, v_offset.a) : pad_value.a;
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y4 = clamp(y4, 0, psc(h) - 1);
+
+            ivec4 v_offset = y4 * psc(w) + x;
+            i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y4 = abs(y4);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y4 = (p.h - 1) - abs(y4 - (p.h - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y4 = (psc(h) - 1) - abs(y4 - (psc(h) - 1));
+
+            ivec4 v_offset = y4 * psc(w) + x;
+            i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+    else // if (psc(dims) == 3)
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left;
+        int y = gy - p.top;
+        ivec4 z4 = gz * 4 - p.front + ivec4(0, 1, 2, 3); // four consecutive source channels, one per output lane
+
+        if (type == 0) // fill mode
+        {
+            bvec4 mask = bvec4(uvec4(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) & (uvec4(greaterThanEqual(z4, ivec4(0))) & uvec4(lessThan(z4, ivec4(psc(c)))))); // x/y test is shared, channel test is per lane
+
+            ivec4 pad_value4 = per_channel_pad == 1 ? i8buffer_ld4(per_channel_pad_blob_data, gz) : pad_value; // per-lane fill values when per_channel_pad is set
+
+            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
+
+            ivec4 v;
+            v.r = mask.r ? i8buffer_ld1(bottom_blob_data, v_offset.r) : pad_value4.r;
+            v.g = mask.g ? i8buffer_ld1(bottom_blob_data, v_offset.g) : pad_value4.g;
+            v.b = mask.b ? i8buffer_ld1(bottom_blob_data, v_offset.b) : pad_value4.b;
+            v.a = mask.a ? i8buffer_ld1(bottom_blob_data, v_offset.a) : pad_value4.a;
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+            z4 = clamp(z4, 0, psc(c) - 1);
+
+            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y = abs(y);
+            z4 = abs(z4);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            z4 = (p.c - 1) - abs(z4 - (p.c - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+            //             z4 = (psc(c) - 1) - abs(z4 - (psc(c) - 1));
+
+            ivec4 v_offset = z4 * psc(cstep) + y * psc(w) + x;
+            i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/padding_pack4_int8.comp b/src/layer/vulkan/shader/padding_pack4_int8.comp
new file mode 100644
index 000000000000..a844f83ea3c7
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_pack4_int8.comp
@@ -0,0 +1,246 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode tested in main(): 0 = fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int() and splats it to 4 lanes
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = the 3-D type-0 path takes its fill value from per_channel_pad_blob
+// Shape constants default to 0 and are only read through psc() (defined outside this file; presumably falls back to p.<name> - confirm)
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input blob shape; main() branches on dims
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; // multiplied by the channel index when addressing the input
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; // output blob shape
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; // multiplied by gz when addressing the output
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; // input blob, 4-wide elements
+layout(binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // padded output blob, 4-wide elements
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8vec4 per_channel_pad_blob_data[]; }; // fill values, indexed by gz in main()
+
+layout(push_constant) uniform parameter // same shape fields as the constants above, plus the pad offsets
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int left; // x offset; main() divides it by 4 in the 1-D path
+    int top; // row offset; main() divides it by 4 in the 2-D path
+    int front; // channel offset; main() divides it by 4 in the 3-D path
+} p;
+
+void main() // one invocation writes one 4-wide element of the padded output blob
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+    // NOTE(review): psc() and the i8buffer_* helpers are defined outside this file; comments assume cp = copy, ld = load, st = store - confirm
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return; // invocation lies outside the output extent
+
+    const ivec4 pad_value = ivec4(int(value)); // float -> int drops the fractional part; same value in all 4 lanes
+
+    if (psc(dims) == 1)
+    {
+        int x = gx - p.left / 4; // offset divided by 4 to match the packed index (assumes left counts scalar elements - confirm)
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w))
+            {
+                i8buffer_cp4(top_blob_data, gx, bottom_blob_data, x);
+            }
+            else
+            {
+                i8buffer_st4(top_blob_data, gx, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, x);
+
+            if (gx < p.left / 4)
+                v = ivec4(v.r); // leading padding: broadcast the first lane of the clamped pack
+            else if (gx >= psc(w) + p.left / 4)
+                v = ivec4(v.a); // trailing padding: broadcast the last lane of the clamped pack
+
+            i8buffer_st4(top_blob_data, gx, v);
+        }
+        if (type == 2) // mirror mode: fold around index 0, then around the last index
+        {
+            x = abs(x);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, x);
+
+            if (gx < p.left / 4)
+            {
+                ivec4 v0 = i8buffer_ld4(bottom_blob_data, x - 1); // previous pack
+                v = ivec4(v.r, v0.a, v0.b, v0.g); // lanes in reversed order, spanning packs x and x - 1
+            }
+            else if (gx >= psc(w) + p.left / 4)
+            {
+                ivec4 v1 = i8buffer_ld4(bottom_blob_data, x + 1); // next pack
+                v = ivec4(v1.b, v1.g, v1.r, v.a); // lanes in reversed order, spanning packs x + 1 and x
+            }
+
+            i8buffer_st4(top_blob_data, gx, v);
+        }
+    }
+    else if (psc(dims) == 2)
+    {
+        const int gi = gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left;
+        int y = gy - p.top / 4; // offset divided by 4 to match the packed row index (assumes top counts scalar rows - confirm)
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h))
+            {
+                int v_offset = y * psc(w) + x;
+                i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
+            }
+            else
+            {
+                i8buffer_st4(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+
+            int v_offset = y * psc(w) + x;
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+
+            if (gy < p.top / 4)
+                v = ivec4(v.r); // leading rows: broadcast the first lane of the clamped pack
+            else if (gy >= psc(h) + p.top / 4)
+                v = ivec4(v.a); // trailing rows: broadcast the last lane of the clamped pack
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y = abs(y);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+
+            int v_offset = y * psc(w) + x;
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+
+            if (gy < p.top / 4)
+            {
+                ivec4 v0 = i8buffer_ld4(bottom_blob_data, v_offset - psc(w)); // same x, previous packed row
+                v = ivec4(v.r, v0.a, v0.b, v0.g); // lanes in reversed order, spanning rows y and y - 1
+            }
+            else if (gy >= psc(h) + p.top / 4)
+            {
+                ivec4 v1 = i8buffer_ld4(bottom_blob_data, v_offset + psc(w)); // same x, next packed row
+                v = ivec4(v1.b, v1.g, v1.r, v.a); // lanes in reversed order, spanning rows y + 1 and y
+            }
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+    }
+    else // if (psc(dims) == 3)
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear index of this output element
+
+        int x = gx - p.left;
+        int y = gy - p.top;
+        int z = gz - p.front / 4; // offset divided by 4 to match the packed channel index (assumes front counts scalar channels - confirm)
+
+        if (type == 0) // fill mode
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c))
+            {
+                int v_offset = z * psc(cstep) + y * psc(w) + x;
+                i8buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
+            }
+            else if (per_channel_pad == 1)
+            {
+                i8buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); // fill value selected by output channel index
+            }
+            else
+            {
+                i8buffer_st4(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp mode
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+            z = clamp(z, 0, psc(c) - 1);
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+
+            if (gz < p.front / 4)
+                v = ivec4(v.r); // leading channels: broadcast the first lane of the clamped pack
+            else if (gz >= psc(c) + p.front / 4)
+                v = ivec4(v.a); // trailing channels: broadcast the last lane of the clamped pack
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+        if (type == 2) // mirror mode
+        {
+            x = abs(x);
+            y = abs(y);
+            z = abs(z);
+            // NOTE psc(X) get zeros on nvidia, so the mirror step reads p.X directly
+            // TODO only enable this workaround for some nvidia driver
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            z = (p.c - 1) - abs(z - (p.c - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+            //             z = (psc(c) - 1) - abs(z - (psc(c) - 1));
+
+            int v_offset = z * psc(cstep) + y * psc(w) + x;
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+
+            if (gz < p.front / 4)
+            {
+                ivec4 v0 = i8buffer_ld4(bottom_blob_data, v_offset - psc(cstep)); // same x/y, previous packed channel
+                v = ivec4(v.r, v0.a, v0.b, v0.g); // lanes in reversed order, spanning channels z and z - 1
+            }
+            else if (gz >= psc(c) + p.front / 4)
+            {
+                ivec4 v1 = i8buffer_ld4(bottom_blob_data, v_offset + psc(cstep)); // same x/y, next packed channel
+                v = ivec4(v1.b, v1.g, v1.r, v.a); // lanes in reversed order, spanning channels z + 1 and z
+            }
+
+            i8buffer_st4(top_blob_data, gi, v);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/padding_pack4to1_int8.comp b/src/layer/vulkan/shader/padding_pack4to1_int8.comp
new file mode 100644
index 000000000000..062d430d361a
--- /dev/null
+++ b/src/layer/vulkan/shader/padding_pack4to1_int8.comp
@@ -0,0 +1,191 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout(constant_id = 0) const int type = 1; // padding mode used by main(): 0 = constant fill, 1 = clamp to edge, 2 = mirror
+layout(constant_id = 1) const float value = 0; // fill value for type 0; main() converts it with int()
+layout(constant_id = 2) const int per_channel_pad = 0; // 1 = the 3-D type-0 path copies from per_channel_pad_blob_data instead of using value
+
+#define shape_constant_id_offset 3
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; // input shape constants; main() reads them through psc()
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; // output shape constants; main() reads them through psc()
+layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; // input; main() loads it with i8buffer_ld4 into an ivec4
+layout(binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // output; main() stores one value per invocation with i8buffer_st1
+layout(binding = 2) readonly buffer per_channel_pad_blob { sint8 per_channel_pad_blob_data[]; }; // read at index gz when per_channel_pad == 1
+
+layout(push_constant) uniform parameter // NOTE(review): psc() is defined outside this file; presumably it falls back to p.X when the constant X is 0 - confirm
+{
+    int dims; // the first ten fields carry the same names as the shape specialization constants
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int left; // subtracted from gx to get the source x (source index in the 1-D case)
+    int top; // subtracted from gy to get the source y
+    int front; // subtracted from gz to get the source z
+} p;
+
+void main() // each invocation stores one int8 output value, taken from one lane of a 4-component input element or from the pad value
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) // invocation lies outside the output extent
+        return;
+
+    const int pad_value = int(value); // float specialization constant converted to int for the int8 store
+
+    if (psc(dims) == 1) // 1-D: x is the unpacked axis, element index is x / 4 and lane is x % 4
+    {
+        int x = gx - p.left; // source index before padding
+
+        if (type == 0) // constant fill
+        {
+            if (x >= 0 && x < psc(w) * 4) // psc(w) counts 4-lane elements, so the scalar extent is w * 4
+            {
+                ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4);
+                i8buffer_st1(top_blob_data, gx, v[x % 4]);
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gx, pad_value);
+            }
+        }
+        if (type == 1) // clamp to edge
+        {
+            x = clamp(x, 0, psc(w) * 4 - 1);
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4);
+            i8buffer_st1(top_blob_data, gx, v[x % 4]);
+        }
+        if (type == 2) // mirror about index 0, then about the last index
+        {
+            x = abs(x);
+            // NOTE: psc(X) evaluates to zero on NVIDIA here, so the push constants p.X are read directly
+            // TODO: only enable this workaround for the affected NVIDIA drivers
+            x = (p.w * 4 - 1) - abs(x - (p.w * 4 - 1));
+            //             x = (psc(w) * 4 - 1) - abs(x - (psc(w) * 4 - 1));
+
+            ivec4 v = i8buffer_ld4(bottom_blob_data, x / 4);
+            i8buffer_st1(top_blob_data, gx, v[x % 4]);
+        }
+    }
+    else if (psc(dims) == 2) // 2-D: y is the unpacked axis, row index is y / 4 and lane is y % 4
+    {
+        const int gi = gy * psc(outw) + gx; // output offset
+
+        int x = gx - p.left;
+        int y = gy - p.top;
+
+        if (type == 0) // constant fill
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) * 4)
+            {
+                int v_offset = (y / 4) * psc(w) + x;
+                ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+                i8buffer_st1(top_blob_data, gi, v[y % 4]);
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp to edge
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) * 4 - 1);
+
+            int v_offset = (y / 4) * psc(w) + x;
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+            i8buffer_st1(top_blob_data, gi, v[y % 4]);
+        }
+        if (type == 2) // mirror about index 0, then about the last index
+        {
+            x = abs(x);
+            y = abs(y);
+            // NOTE: psc(X) evaluates to zero on NVIDIA here, so the push constants p.X are read directly
+            // TODO: only enable this workaround for the affected NVIDIA drivers
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h * 4 - 1) - abs(y - (p.h * 4 - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) * 4 - 1) - abs(y - (psc(h) * 4 - 1));
+
+            int v_offset = (y / 4) * psc(w) + x;
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+            i8buffer_st1(top_blob_data, gi, v[y % 4]);
+        }
+    }
+    else // if (psc(dims) == 3)
+    {
+        const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // output offset
+
+        int x = gx - p.left;
+        int y = gy - p.top;
+        int z = gz - p.front; // z is the unpacked axis: element plane is z / 4 and lane is z % 4
+
+        if (type == 0) // constant fill, optionally with a pad value looked up by gz
+        {
+            if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c) * 4)
+            {
+                int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x;
+                ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+                i8buffer_st1(top_blob_data, gi, v[z % 4]);
+            }
+            else if (per_channel_pad == 1)
+            {
+                i8buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); // pad value comes from the blob entry at output index gz
+            }
+            else
+            {
+                i8buffer_st1(top_blob_data, gi, pad_value);
+            }
+        }
+        if (type == 1) // clamp to edge
+        {
+            x = clamp(x, 0, psc(w) - 1);
+            y = clamp(y, 0, psc(h) - 1);
+            z = clamp(z, 0, psc(c) * 4 - 1);
+
+            int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x;
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+            i8buffer_st1(top_blob_data, gi, v[z % 4]);
+        }
+        if (type == 2) // mirror about index 0, then about the last index
+        {
+            x = abs(x);
+            y = abs(y);
+            z = abs(z);
+            // NOTE: psc(X) evaluates to zero on NVIDIA here, so the push constants p.X are read directly
+            // TODO: only enable this workaround for the affected NVIDIA drivers
+            x = (p.w - 1) - abs(x - (p.w - 1));
+            y = (p.h - 1) - abs(y - (p.h - 1));
+            z = (p.c * 4 - 1) - abs(z - (p.c * 4 - 1));
+            //             x = (psc(w) - 1) - abs(x - (psc(w) - 1));
+            //             y = (psc(h) - 1) - abs(y - (psc(h) - 1));
+            //             z = (psc(c) * 4 - 1) - abs(z - (psc(c) * 4 - 1));
+
+            int v_offset = (z / 4) * psc(cstep) + y * psc(w) + x;
+            ivec4 v = i8buffer_ld4(bottom_blob_data, v_offset);
+            i8buffer_st1(top_blob_data, gi, v[z % 4]);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/quantize.comp b/src/layer/vulkan/shader/quantize.comp
index 65dc245f8917..39c30f02b81e 100644
--- a/src/layer/vulkan/shader/quantize.comp
+++ b/src/layer/vulkan/shader/quantize.comp
@@ -44,7 +44,8 @@ void main()
         scale = buffer_ld1(scale_blob_data, gy);
     }
 
-    int v_int = int(round(clamp(v * scale, afp(-127.f), afp(127.f))));
+    v *= scale;
+    int v_int = float2int8(v);
 
     const uint outgi = gy * psc(out_stride) + gx;
 
diff --git a/src/layer/vulkan/shader/quantize_pack4.comp b/src/layer/vulkan/shader/quantize_pack4.comp
index 7ef6d3208547..7603e0758ed0 100644
--- a/src/layer/vulkan/shader/quantize_pack4.comp
+++ b/src/layer/vulkan/shader/quantize_pack4.comp
@@ -44,7 +44,8 @@ void main()
         scale = buffer_ld4(scale_blob_data, gy);
     }
 
-    ivec4 v_int = ivec4(round(clamp(v * scale, afp(-127.f), afp(127.f))));
+    v *= scale;
+    ivec4 v_int = float2int8vec4(v);
 
     const uint outgi = gy * psc(out_stride) + gx;
 
diff --git a/src/layer/vulkan/shader/requantize.comp b/src/layer/vulkan/shader/requantize.comp
index d129e26a81b1..c2eebf00351d 100644
--- a/src/layer/vulkan/shader/requantize.comp
+++ b/src/layer/vulkan/shader/requantize.comp
@@ -84,7 +84,8 @@ void main()
         scale_out = buffer_ld1(scale_out_blob_data, gy);
     }
 
-    int v_int = int(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f))));
+    v_fp *= scale_out;
+    int v_int = float2int8(v_fp);
 
     const uint outgi = gy * psc(out_stride) + gx;
 
diff --git a/src/layer/vulkan/shader/requantize_pack4.comp b/src/layer/vulkan/shader/requantize_pack4.comp
index 3638b6414ac7..5855c510af39 100644
--- a/src/layer/vulkan/shader/requantize_pack4.comp
+++ b/src/layer/vulkan/shader/requantize_pack4.comp
@@ -84,7 +84,8 @@ void main()
         scale_out = buffer_ld4(scale_out_blob_data, gy);
     }
 
-    ivec4 v_int = ivec4(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f))));
+    v_fp *= scale_out;
+    ivec4 v_int = float2int8vec4(v_fp);
 
     const uint outgi = gy * psc(out_stride) + gx;
 
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 6ddabc7360bc..73a113aabdfd 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -523,6 +523,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
+        if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
+        {
+            NCNN_LOGE("Convolution int8 1d input is not supported, please replace this layer with InnerProduct");
+            NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+            return -1;
+        }
+
 #if NCNN_BF16
         if (opt.use_bf16_storage && bottom_blob.elembits() == 16)
         {
@@ -538,16 +545,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     }
 #endif
 
-#if NCNN_BF16
-    if (opt.use_bf16_storage && bottom_blob.elembits() == 16)
-    {
-        return forward_bf16s(bottom_blob, top_blob, opt);
-    }
-#endif
-
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
@@ -589,6 +592,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         return 0;
     }
 
+#if NCNN_BF16
+    if (opt.use_bf16_storage && bottom_blob.elembits() == 16)
+    {
+        return forward_bf16s(bottom_blob, top_blob, opt);
+    }
+#endif
+
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
@@ -1366,6 +1376,9 @@ int Convolution_x86::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
     // flattened blob, implement as InnerProduct
     if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
     {
+        NCNN_LOGE("Convolution 1d input compatibility path is deprecated and will be removed, please replace this layer with InnerProduct");
+        NCNN_LOGE("ncnn param suggestion: Convolution ... 0=%d 1=1 11=1 5=%d 6=%d 8=%d 9=%d 10=... -> InnerProduct ... 0=%d 1=%d 2=%d 8=%d 9=%d 10=...", num_output, bias_term, weight_data_size, int8_scale_term, activation_type, num_output, bias_term, weight_data_size, int8_scale_term, activation_type);
+
         Mat bottom_blob_3d;
         if (bottom_blob.elemsize % 16 == 0)
         {
diff --git a/src/net.cpp b/src/net.cpp
index 57e5a1241960..8c4d5d5fe80f 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1101,6 +1101,7 @@ int Net::load_param(const DataReader& dr)
         // sanitize use options
         if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
         if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+        if (!d->vkdev->info.support_int16_storage() || !d->vkdev->info.support_int16_arithmetic()) opt.use_int16_storage = false;
         if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
         if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
         if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
@@ -1279,12 +1280,6 @@ int Net::load_param(const DataReader& dr)
             return -1;
         }
 
-        if (layer->support_int8_storage)
-        {
-            // no int8 gpu support yet
-            opt.use_vulkan_compute = false;
-        }
-
         Option opt1 = get_masked_option(opt, layer->featmask);
 
         if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
@@ -1480,6 +1475,7 @@ int Net::load_param_bin(const DataReader& dr)
         // sanitize use options
         if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
         if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+        if (!d->vkdev->info.support_int16_storage() || !d->vkdev->info.support_int16_arithmetic()) opt.use_int16_storage = false;
         if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
         if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
         if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
@@ -1641,12 +1637,6 @@ int Net::load_param_bin(const DataReader& dr)
             return -1;
         }
 
-        if (layer->support_int8_storage)
-        {
-            // no int8 gpu support yet
-            opt.use_vulkan_compute = false;
-        }
-
         Option opt1 = get_masked_option(opt, layer->featmask);
 
         if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
diff --git a/src/option.cpp b/src/option.cpp
index 6e03ebc7f663..876b81cc349e 100644
--- a/src/option.cpp
+++ b/src/option.cpp
@@ -39,7 +39,7 @@ Option::Option()
     use_fp16_arithmetic = true;
     use_int8_packed = true;
     use_int8_storage = true;
-    use_int8_arithmetic = false;
+    use_int8_arithmetic = true;
 
     use_packing_layout = true;
 
@@ -70,8 +70,8 @@ Option::Option()
     use_fp16_uniform = true;
     use_int8_uniform = true;
 
-    use_reserved_9 = false;
-    use_reserved_10 = false;
+    use_int16_packed = true;
+    use_int16_storage = true;
     use_reserved_11 = false;
 }
 
diff --git a/src/option.h b/src/option.h
index b65cb579ec7f..4be2175a7576 100644
--- a/src/option.h
+++ b/src/option.h
@@ -145,8 +145,9 @@ class NCNN_EXPORT Option
     bool use_fp16_uniform;
     bool use_int8_uniform;
 
-    bool use_reserved_9;
-    bool use_reserved_10;
+    // enable int16 layout options for vulkan int8 shader intermediate data
+    bool use_int16_packed;
+    bool use_int16_storage;
     bool use_reserved_11;
 };
 
diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp
index ab9a121670b6..33f423c7957b 100644
--- a/src/pipelinecache.cpp
+++ b/src/pipelinecache.cpp
@@ -22,7 +22,7 @@ namespace ncnn {
 
 #if NCNN_VULKAN
 #define NCNN_PIPELINE_CACHE_FILE_MAGIC   0x5a545546
-#define NCNN_PIPELINE_CACHE_FILE_VERSION 1
+#define NCNN_PIPELINE_CACHE_FILE_VERSION 2
 #define NCNN_PIPELINE_CACHE_FILE_ENDIAN  0x12345678
 #define NCNN_PIPELINE_CACHE_NCNN_VERSION NCNN_VERSION_NUMBER
 
@@ -216,7 +216,9 @@ static uint32_t encode_spirv_cache_opt_bits(const Option& opt)
            | (uint32_t)opt.use_int8_uniform << 9
            | (uint32_t)opt.use_subgroup_ops << 10
            | (uint32_t)opt.use_shader_local_memory << 11
-           | (uint32_t)opt.use_cooperative_matrix << 12;
+           | (uint32_t)opt.use_cooperative_matrix << 12
+           | (uint32_t)opt.use_int16_packed << 13
+           | (uint32_t)opt.use_int16_storage << 14;
 }
 
 static bool can_cache_spirv(const VulkanDevice* vkdev, const Option& opt)
diff --git a/tests/test_c_api.cpp b/tests/test_c_api.cpp
index 643ed15e3d28..7c00801bc982 100644
--- a/tests/test_c_api.cpp
+++ b/tests/test_c_api.cpp
@@ -355,6 +355,8 @@ static int test_c_api_3()
     TEST_OPTION_SET_GET(use_int8_packed, 1, 0)
     TEST_OPTION_SET_GET(use_int8_storage, 1, 0)
     TEST_OPTION_SET_GET(use_int8_arithmetic, 1, 0)
+    TEST_OPTION_SET_GET(use_int16_packed, 1, 0)
+    TEST_OPTION_SET_GET(use_int16_storage, 1, 0)
     TEST_OPTION_SET_GET(use_bf16_packed, 1, 0)
     TEST_OPTION_SET_GET(use_bf16_storage, 1, 0)
 
diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp
index 02680fc1e893..8cdd986a711d 100644
--- a/tests/test_convolution_3.cpp
+++ b/tests/test_convolution_3.cpp
@@ -129,10 +129,14 @@ static int test_convolution_3()
 }
 
 #if NCNN_INT8
-static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false)
+static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false, int int8_scale_term = 0, bool sgemm = false, bool input_int8 = false)
 {
     ncnn::Mat a = RandomMat(w, h, c);
 
+    if (int8_scale_term == 0)
+        int8_scale_term = requant ? 101 : 1;
+    const bool use_requant = int8_scale_term > 100;
+
     ncnn::ParamDict pd;
     pd.set(0, outch);
     pd.set(1, kernel);
@@ -141,7 +145,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     pd.set(4, pad);
     pd.set(5, bias);
     pd.set(6, outch * c * kernel * kernel);
-    pd.set(8, requant ? 101 : 1); // int8_scale_term
+    pd.set(8, int8_scale_term); // int8_scale_term
 
     int activation_type = RAND() % 7; // 0 1 2 3 4 5 6
     ncnn::Mat activation_params(2);
@@ -155,7 +159,16 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
 
     ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel);
     ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
-    ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat();
+    ncnn::Mat top_scales = use_requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat();
+
+    ncnn::Mat a_int8 = a;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = false;
+        ncnn::quantize_to_int8(a, a_int8, input_scales, opt);
+    }
 
     if (kernel == 3 && dilation == 1 && stride == 1)
     {
@@ -178,14 +191,36 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
         weights[3] = top_scales;
     }
 
-    int flag = TEST_LAYER_DISABLE_GPU_TESTING;
-    int ret = test_layer("Convolution", pd, weights, a, requant ? 1.0f : 0.001f, flag);
+    int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0;
+    int ret = 0;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = true;
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_fp16_arithmetic = false;
+        opt.use_bf16_packed = false;
+        opt.use_bf16_storage = false;
+        opt.use_sgemm_convolution = sgemm;
+        opt.use_winograd_convolution = false;
+
+        ret = test_layer_opt("Convolution", pd, weights, opt, a_int8, use_requant ? 1.0f : 0.001f, flag);
+    }
+    else
+    {
+        ret = test_layer("Convolution", pd, weights, a_int8, use_requant ? 1.0f : 0.001f, flag);
+    }
     if (ret != 0)
     {
-        fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+        fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d sgemm=%d input_int8=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, sgemm, input_int8, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
 
+    if (input_int8)
+        return ret;
+
     if (kernel == 3 && dilation == 1 && stride == 1)
     {
         ncnn::Option opt;
@@ -201,10 +236,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
         opt.use_winograd23_convolution = true;
         opt.use_winograd43_convolution = false;
 
-        ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag);
+        ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag);
         if (ret != 0)
         {
-            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]);
             return ret;
         }
     }
@@ -221,10 +256,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
         opt.use_sgemm_convolution = false;
         opt.use_winograd_convolution = false;
 
-        ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag);
+        ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag);
         if (ret != 0)
         {
-            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]);
             return ret;
         }
     }
@@ -241,10 +276,10 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
         opt.use_sgemm_convolution = false;
         opt.use_winograd_convolution = false;
 
-        ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag);
+        ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag);
         if (ret != 0)
         {
-            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]);
             return ret;
         }
     }
@@ -261,10 +296,31 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
         opt.use_sgemm_convolution = false;
         opt.use_winograd_convolution = false;
 
-        ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, flag);
+        ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag);
+        if (ret != 0)
+        {
+            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]);
+            return ret;
+        }
+    }
+
+    if (sgemm)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = true;
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_fp16_arithmetic = false;
+        opt.use_bf16_packed = false;
+        opt.use_bf16_storage = false;
+        opt.use_sgemm_convolution = true;
+        opt.use_winograd_convolution = false;
+
+        ret = test_layer_opt("Convolution", pd, weights, opt, a, use_requant ? 1.0f : 0.001f, flag);
         if (ret != 0)
         {
-            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
+            fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d int8_scale_term=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, int8_scale_term, activation_type, activation_params[0], activation_params[1]);
             return ret;
         }
     }
@@ -356,7 +412,22 @@ static int test_convolution_1()
            || test_convolution_int8(3, 9, 16, 13, 2, 2, 1, 0, 0)
            || test_convolution_int8(33, 5, 15, 5, 2, 1, 3, 0, 1)
            || test_convolution_int8(23, 11, 33, 28, 5, 1, 1, 0, 1)
-           || test_convolution_int8(3, 63, 2, 28, 2, 1, 2, 0, 0);
+           || test_convolution_int8(3, 63, 2, 28, 2, 1, 2, 0, 0)
+           || test_convolution_int8(7, 5, 4, 8, 1, 1, 1, 0, 1, false, 2)
+           || test_convolution_int8(7, 5, 4, 8, 1, 1, 1, 0, 1, true, 102)
+           || test_convolution_int8(9, 7, 8, 12, 2, 1, 2, 1, 1, false, 1, true)
+           || test_convolution_int8(9, 7, 8, 12, 2, 1, 2, 1, 1, true, 101, true);
+}
+
+static int test_convolution_1_int8_input() // Convolution int8 cases run with input_int8 = true, so the helper quantizes the input itself and disables auto input casting
+{
+    return 0 // the || chain stops at the first failing case; the result is 0 when every case passes, otherwise 1
+           || test_convolution_int8(7, 5, 1, 1, 3, 1, 1, 1, 1, false, 1, false, true) // 3x3, c=1 -> outch=1, int8_scale_term=1
+           || test_convolution_int8(7, 5, 4, 4, 3, 1, 1, 1, 1, false, 1, false, true) // 3x3, c=4 -> outch=4, int8_scale_term=1
+           || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, false, 1, false, true) // 1x1, c=4 -> outch=8, int8_scale_term=1
+           || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, false, 2, false, true) // same shape, int8_scale_term=2
+           || test_convolution_int8(8, 6, 4, 8, 1, 1, 1, 0, 1, true, 102, false, true) // same shape, int8_scale_term=102 (> 100, so top scales are generated)
+           || test_convolution_int8(9, 7, 8, 8, 2, 1, 1, 1, 1, false, 1, true, true); // 2x2, c=8 -> outch=8, with use_sgemm_convolution enabled
 }
 
 static int test_convolution_1_2()
@@ -443,6 +514,7 @@ int main()
 #if NCNN_INT8
     return 0
            || test_convolution_1()
+           || test_convolution_1_int8_input()
            || test_convolution_1_2()
            || test_convolution_2()
            || test_convolution_3();
diff --git a/tests/test_convolutiondepthwise_1.cpp b/tests/test_convolutiondepthwise_1.cpp
index ae408ba9849b..85c52dc38d30 100644
--- a/tests/test_convolutiondepthwise_1.cpp
+++ b/tests/test_convolutiondepthwise_1.cpp
@@ -84,10 +84,14 @@ static int test_convolutiondepthwise_2()
 }
 
 #if NCNN_INT8
-static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false)
+static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false, int int8_scale_term = 0, bool input_int8 = false)
 {
     ncnn::Mat a = RandomMat(w, h, c);
 
+    if (int8_scale_term == 0)
+        int8_scale_term = requant ? 101 : 1;
+    const bool use_requant = int8_scale_term > 100;
+
     ncnn::ParamDict pd;
     pd.set(0, outch);
     pd.set(1, kernel);
@@ -97,7 +101,7 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke
     pd.set(5, bias);
     pd.set(6, outch / group * c / group * kernel * kernel * group);
     pd.set(7, group);
-    pd.set(8, requant ? 101 : 1); // int8_scale_term
+    pd.set(8, int8_scale_term); // int8_scale_term
 
     int activation_type = RAND() % 7; // 0 1 2 3 4 5 6
     ncnn::Mat activation_params(2);
@@ -108,9 +112,23 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke
 
     std::vector<ncnn::Mat> weights(bias ? 5 : 4);
     weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group);
-    ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group);
+    ncnn::Mat weight_scales;
+    if (int8_scale_term == 2 || int8_scale_term == 102)
+        weight_scales = scales_mat(weights[0], 1, weights[0].w, weights[0].w);
+    else
+        weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group);
     ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
-    ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat();
+    ncnn::Mat top_scales = use_requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat();
+
+    ncnn::Mat a_int8 = a;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = false;
+        ncnn::quantize_to_int8(a, a_int8, input_scales, opt);
+    }
+
     if (bias)
     {
         weights[1] = RandomMat(outch);
@@ -125,11 +143,28 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke
         weights[3] = top_scales;
     }
 
-    int flag = TEST_LAYER_DISABLE_GPU_TESTING;
-    int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, flag);
+    int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0;
+    int ret = 0;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = true;
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_fp16_arithmetic = false;
+        opt.use_bf16_packed = false;
+        opt.use_bf16_storage = false;
+
+        ret = test_layer_opt("ConvolutionDepthWise", pd, weights, opt, a_int8, use_requant ? 1.0f : 0.001f, flag);
+    }
+    else
+    {
+        ret = test_layer("ConvolutionDepthWise", pd, weights, a_int8, use_requant ? 1.0f : 0.001f, flag);
+    }
     if (ret != 0)
     {
-        fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]);
+        fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d int8_scale_term=%d input_int8=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, int8_scale_term, input_int8, activation_type, activation_params[0], activation_params[1]);
     }
 
     return ret;
@@ -176,7 +211,9 @@ static int test_convolutiondepthwise_1()
                   || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4)
                   || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15)
                   || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2)
-                  || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16);
+                  || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16)
+                  || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8, false, 2)
+                  || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 2, false, 2);
 
         if (ret != 0)
             return -1;
@@ -202,7 +239,9 @@ static int test_convolutiondepthwise_1()
                   || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true)
                   || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true)
                   || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true)
-                  || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true);
+                  || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true)
+                  || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true, 102)
+                  || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 2, true, 102);
 
         if (ret != 0)
             return -1;
@@ -210,6 +249,17 @@ static int test_convolutiondepthwise_1()
 
     return 0;
 }
+
+static int test_convolutiondepthwise_1_int8_input() // Extra int8 depthwise cases that all pass true as the final argument; returns 0 only if every case returns 0, otherwise 1.
+{ // NOTE(review): the helper's signature is outside this hunk; its failure message lists w, h, c, outch, kernel, dilation, stride, pad, bias, group, int8_scale_term, input_int8 - confirm how the 13 positional arguments map onto those names.
+    return 0 // seed of the || chain: evaluation stops at the first case that returns non-zero
+           || test_convolutiondepthwise_int8(9, 7, 1, 1, 3, 1, 1, 1, 1, 1, false, 1, true)
+           || test_convolutiondepthwise_int8(9, 7, 4, 4, 3, 1, 1, 1, 1, 4, false, 1, true)
+           || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, false, 1, true)
+           || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, false, 2, true) // same as the previous case except the 12th argument is 2 instead of 1
+           || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 8, true, 101, true) // the last two cases pass true as the 11th argument and 101/102 as the 12th (presumably requant with the matching int8_scale_term - TODO confirm)
+           || test_convolutiondepthwise_int8(9, 7, 8, 8, 3, 1, 1, 1, 1, 2, true, 102, true);
+}
 #endif // NCNN_INT8
 
 int main()
@@ -217,7 +267,7 @@ int main()
     SRAND(7767517);
 
 #if NCNN_INT8
-    return test_convolutiondepthwise_1() || test_convolutiondepthwise_2();
+    return test_convolutiondepthwise_1() || test_convolutiondepthwise_1_int8_input() || test_convolutiondepthwise_2();
 #else
     return test_convolutiondepthwise_2();
 #endif
diff --git a/tests/test_gemm_3.cpp b/tests/test_gemm_3.cpp
index f0224ca4786b..9a7eeeb6c759 100644
--- a/tests/test_gemm_3.cpp
+++ b/tests/test_gemm_3.cpp
@@ -218,7 +218,16 @@ static int test_gemm_int8(int M, int N, int K, float alpha, int transA, int tran
         RandomizeB(a[a.size() - 1], 10.f);
     }
 
-    int ret = test_layer("Gemm", pd, weights, a);
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = true;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_bf16_packed = false;
+    opt.use_bf16_storage = false;
+
+    int ret = test_layer_opt("Gemm", pd, weights, opt, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_gemm_int8 failed M=%d N=%d K=%d alpha=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d output_N1M=%d\n", M, N, K, alpha, transA, transB, output_elemtype, output_transpose, constantA, constantB, output_N1M);
@@ -227,7 +236,7 @@ static int test_gemm_int8(int M, int N, int K, float alpha, int transA, int tran
     return ret;
 }
 
-static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float alpha, float beta, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int constantC)
+static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float alpha, float beta, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int constantC, bool use_bf16 = false, int output_N1M = 0)
 {
     int broadcast_type_C = 0;
     if (C.dims == 1 && C.w == 1)
@@ -274,6 +283,7 @@ static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float al
     pd.set(8, N);
     pd.set(9, K);
     pd.set(10, broadcast_type_C);
+    pd.set(11, output_N1M);
     // pd.set(12, 1);                  // output_elempack
     pd.set(13, output_elemtype);
     pd.set(14, output_transpose);
@@ -289,20 +299,29 @@ static int test_gemm_int8_bias(int M, int N, int K, const ncnn::Mat& C, float al
     std::vector<ncnn::Mat> a;
     if (!constantA)
     {
-        a.push_back(transA ? ncnn::Mat(M, K) : ncnn::Mat(K, M));
+        a.push_back(transA ? (output_N1M ? ncnn::Mat(M, 1, K) : ncnn::Mat(M, K)) : (output_N1M ? ncnn::Mat(K, 1, M) : ncnn::Mat(K, M)));
         RandomizeA(a[a.size() - 1], transA, 10.f);
     }
     if (!constantB)
     {
-        a.push_back(transB ? ncnn::Mat(K, N) : ncnn::Mat(N, K));
+        a.push_back(transB ? (output_N1M ? ncnn::Mat(K, 1, N) : ncnn::Mat(K, N)) : (output_N1M ? ncnn::Mat(N, 1, K) : ncnn::Mat(N, K)));
         RandomizeB(a[a.size() - 1], 10.f);
     }
     if (!constantC) a.push_back(C);
 
-    int ret = test_layer("Gemm", pd, weights, a);
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = true;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_bf16_packed = use_bf16;
+    opt.use_bf16_storage = use_bf16;
+
+    int ret = test_layer_opt("Gemm", pd, weights, opt, a);
     if (ret != 0)
     {
-        fprintf(stderr, "test_gemm_int8_bias failed M=%d N=%d K=%d C.dims=%d C=(%d %d %d) alpha=%f beta=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d constantC=%d\n", M, N, K, C.dims, C.w, C.h, C.c, alpha, beta, transA, transB, output_elemtype, output_transpose, constantA, constantB, constantC);
+        fprintf(stderr, "test_gemm_int8_bias failed M=%d N=%d K=%d C.dims=%d C=(%d %d %d) alpha=%f beta=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d constantC=%d use_bf16=%d output_N1M=%d\n", M, N, K, C.dims, C.w, C.h, C.c, alpha, beta, transA, transB, output_elemtype, output_transpose, constantA, constantB, constantC, use_bf16, output_N1M);
     }
 
     return ret;
@@ -366,6 +385,64 @@ static int test_gemm_int8_fp16s(int M, int N, int K, float alpha, int transA, in
     return 0;
 }
 
+static int test_gemm_int8_bf16s(int M, int N, int K, float alpha, int transA, int transB, int output_elemtype, int output_transpose, int constantA, int constantB, int output_N1M)
+{ // Runs one "Gemm" case through test_layer_opt with the bf16 packed/storage options switched on; returns 0 on success, otherwise the non-zero test_layer_opt result after logging the parameters.
+    ncnn::ParamDict pd;
+    pd.set(0, alpha);
+    pd.set(1, 1.f); // beta
+    pd.set(2, transA);
+    pd.set(3, transB);
+    pd.set(4, constantA);
+    pd.set(5, constantB);
+    pd.set(6, 1); // presumably constantC, following ids 4 and 5 - TODO confirm against the Gemm layer params
+    pd.set(7, M);
+    pd.set(8, N);
+    pd.set(9, K);
+    pd.set(10, -1); // id 10 is broadcast_type_C in test_gemm_int8_bias; -1 presumably means "no C term" (no C Mat is supplied here) - confirm
+    pd.set(11, output_N1M);
+    pd.set(13, output_elemtype);
+    pd.set(14, output_transpose);
+    pd.set(18, 2); // int8_scale_term
+
+    std::vector<ncnn::Mat> weights; // only filled for constant operands: RandomS8Mat data for A then B, followed by one RandomMat(.., 10.f, 20.f) per constant operand
+    if (constantA) weights.push_back(transA ? RandomS8Mat(M, K) : RandomS8Mat(K, M));
+    if (constantB) weights.push_back(transB ? RandomS8Mat(K, N) : RandomS8Mat(N, K));
+    if (constantA) weights.push_back(RandomMat(M, 10.f, 20.f)); // presumably the scales for A, one per M - TODO confirm
+    if (constantB) weights.push_back(RandomMat(1, 10.f, 20.f)); // presumably a single scale for B - TODO confirm
+
+    std::vector<ncnn::Mat> a; // runtime inputs: only the operands that are not constant, A before B
+    if (!constantA)
+    {
+        a.push_back(transA ? (output_N1M ? ncnn::Mat(M, 1, K) : ncnn::Mat(M, K)) : (output_N1M ? ncnn::Mat(K, 1, M) : ncnn::Mat(K, M))); // with output_N1M set, the three-argument Mat (1 in the middle) replaces the two-argument Mat
+        RandomizeA(a[a.size() - 1], transA, 10.f);
+    }
+    if (!constantB)
+    {
+        a.push_back(transB ? (output_N1M ? ncnn::Mat(K, 1, N) : ncnn::Mat(K, N)) : (output_N1M ? ncnn::Mat(N, 1, K) : ncnn::Mat(N, K))); // same shape selection as for A, using the K and N extents
+        RandomizeB(a[a.size() - 1], 10.f);
+    }
+
+    ncnn::Option opt; // explicit option set: single thread, packing on, all fp16 switches off, both bf16 switches on
+    opt.num_threads = 1;
+    opt.use_packing_layout = true;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_bf16_packed = true;
+    opt.use_bf16_storage = true;
+
+    float epsilon = 0.001; // forwarded as the last argument of test_layer_opt below (presumably the comparison tolerance - confirm)
+
+    int ret = test_layer_opt("Gemm", pd, weights, opt, a, 1, epsilon); // the literal 1 is presumably the expected output blob count - TODO confirm against testutil
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_gemm_int8_bf16s failed M=%d N=%d K=%d alpha=%f transA=%d transB=%d output_elemtype=%d output_transpose=%d constantA=%d constantB=%d output_N1M=%d\n", M, N, K, alpha, transA, transB, output_elemtype, output_transpose, constantA, constantB, output_N1M);
+        return ret; // propagate the failing code unchanged
+    }
+
+    return 0;
+}
+
 static int test_gemm_0(int M, int N, int K)
 {
     return 0
@@ -422,6 +499,9 @@ static int test_gemm_1(int M, int N, int K)
            || test_gemm_int8_bias(M, N, K, RandomMat(N), 0.8f, 1.f, 0, 0, 1, 1, 0, 0, 0)
            || test_gemm_int8_bias(M, N, K, RandomMat(N), 3.1f, -0.6f, 0, 1, 2, 0, 0, 0, 0)
            || test_gemm_int8_bias(M, N, K, RandomMat(N), 3.1f, -0.6f, 0, 1, 3, 1, 0, 0, 0)
+           || test_gemm_int8_bias(M, N, K, RandomMat(1), 1.7f, -0.4f, 0, 0, 0, 0, 0, 0, 0, false, 1)
+           || test_gemm_int8_bias(M, N, K, RandomMat(M), -1.3f, 0.6f, 1, 0, 0, 1, 0, 0, 0, false, 1)
+           || test_gemm_int8_bias(M, N, K, RandomMat(N, M), 0.8f, 0.5f, 0, 1, 0, 0, 0, 0, 0, false, 1)
 
            || test_gemm_int8_bias(M, N, K, RandomMat(1), -2.1f, 0.5f, 0, 0, 0, 0, 1, 1, 1)
            || test_gemm_int8_bias(M, N, K, RandomMat(1), -2.1f, 0.5f, 0, 0, 1, 1, 1, 1, 1)
@@ -504,6 +584,13 @@ int main()
                 return ret;
         }
     }
+
+    if (test_gemm_int8_bf16s(12, 23, 12, 1.f, 0, 1, 0, 0, 0, 0, 0)
+            || test_gemm_int8_bf16s(12, 23, 12, 1.f, 1, 0, 0, 1, 0, 0, 0)
+            || test_gemm_int8_bias(12, 23, 12, RandomMat(23), 1.7f, -0.4f, 0, 1, 0, 1, 0, 0, 0, true))
+    {
+        return -1;
+    }
 #else
     // test nothing for non-int8 build
 #endif
diff --git a/tests/test_gemm_4.cpp b/tests/test_gemm_4.cpp
index 911a03bbd6f2..8d350e671170 100644
--- a/tests/test_gemm_4.cpp
+++ b/tests/test_gemm_4.cpp
@@ -204,7 +204,16 @@ static int test_gemm_int8(int M, int N, int K, int TILE_M, int TILE_N, int TILE_
     RandomizeA(a[0], transA, 10.f);
     RandomizeB(a[1], 10.f);
 
-    int ret = test_layer("Gemm", pd, weights, a);
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = true;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_bf16_packed = false;
+    opt.use_bf16_storage = false;
+
+    int ret = test_layer_opt("Gemm", pd, weights, opt, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_gemm_int8 failed M=%d N=%d K=%d TILE_M=%d TILE_N=%d TILE_K=%d alpha=%f transA=%d transB=%d output_transpose=%d\n", M, N, K, TILE_M, TILE_N, TILE_K, alpha, transA, transB, output_transpose);
diff --git a/tests/test_innerproduct.cpp b/tests/test_innerproduct.cpp
index 921225308d88..66a2fbddda06 100644
--- a/tests/test_innerproduct.cpp
+++ b/tests/test_innerproduct.cpp
@@ -86,7 +86,7 @@ static int test_innerproduct_3()
 }
 
 #if NCNN_INT8
-static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
+static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias, bool input_int8 = false, bool weight_int8 = false)
 {
     ncnn::ParamDict pd;
     pd.set(0, outch); // num_output
@@ -103,10 +103,19 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
 
     std::vector<ncnn::Mat> weights(bias ? 4 : 3);
     const int k = a.w * a.h * a.d * a.c;
-    weights[0] = RandomMat(outch * k);
-    ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k);
+    weights[0] = weight_int8 ? RandomS8Mat(outch * k) : RandomMat(outch * k);
+    ncnn::Mat weight_scales = weight_int8 ? RandomMat(outch, 10.f, 20.f) : scales_mat(weights[0], outch, k, k);
     ncnn::Mat input_scales = scales_mat(a, 1, k, k);
 
+    ncnn::Mat a_int8 = a;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = false;
+        ncnn::quantize_to_int8(a, a_int8, input_scales, opt);
+    }
+
     if (bias)
     {
         weights[1] = RandomMat(outch);
@@ -119,11 +128,28 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
         weights[2] = input_scales;
     }
 
-    int flag = TEST_LAYER_DISABLE_GPU_TESTING;
-    int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, flag);
+    int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0;
+    int ret = 0;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = true;
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_fp16_arithmetic = false;
+        opt.use_bf16_packed = false;
+        opt.use_bf16_storage = false;
+
+        ret = test_layer_opt("InnerProduct", pd, weights, opt, a_int8, 0.001f, flag);
+    }
+    else
+    {
+        ret = test_layer("InnerProduct", pd, weights, a_int8, 0.001f, flag);
+    }
     if (ret != 0)
     {
-        fprintf(stderr, "test_innerproduct_int8 failed a.dims=%d a=(%d %d %d %d) outch=%d bias=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.d, a.c, outch, bias, activation_type, activation_params[0], activation_params[1]);
+        fprintf(stderr, "test_innerproduct_int8 failed a.dims=%d a=(%d %d %d %d) outch=%d bias=%d input_int8=%d weight_int8=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.d, a.c, outch, bias, input_int8, weight_int8, activation_type, activation_params[0], activation_params[1]);
     }
 
     return ret;
@@ -142,7 +168,14 @@ static int test_innerproduct_4()
            || test_innerproduct_int8(RandomMat(6, 2, 8), 8, 1)
            || test_innerproduct_int8(RandomMat(8, 3, 15), 15, 1)
            || test_innerproduct_int8(RandomMat(7, 2, 16), 4, 1)
-           || test_innerproduct_int8(RandomMat(6, 3, 16), 16, 1);
+           || test_innerproduct_int8(RandomMat(6, 3, 16), 16, 1)
+           || test_innerproduct_int8(RandomMat(16), 16, 1, true)
+           || test_innerproduct_int8(RandomMat(32), 16, 1, false, true)
+           || test_innerproduct_int8(RandomMat(16), 12, 1, true, true)
+           || test_innerproduct_int8(RandomMat(2, 2, 1), 7, 1, true)
+           || test_innerproduct_int8(RandomMat(2, 2, 2), 7, 1, true)
+           || test_innerproduct_int8(RandomMat(2, 2, 3), 7, 1, true)
+           || test_innerproduct_int8(RandomMat(2, 2, 4), 8, 1, true);
 }
 #endif // NCNN_INT8
 
@@ -205,7 +238,7 @@ static int test_innerproduct_5()
 }
 
 #if NCNN_INT8
-static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias)
+static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias, bool input_int8 = false, bool weight_int8 = false)
 {
     ncnn::ParamDict pd;
     pd.set(0, outch);
@@ -215,10 +248,19 @@ static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias)
 
     std::vector<ncnn::Mat> weights(bias ? 4 : 3);
     const int k = a.w;
-    weights[0] = RandomMat(outch * k);
-    ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k);
+    weights[0] = weight_int8 ? RandomS8Mat(outch * k) : RandomMat(outch * k);
+    ncnn::Mat weight_scales = weight_int8 ? RandomMat(outch, 10.f, 20.f) : scales_mat(weights[0], outch, k, k);
     ncnn::Mat input_scales = scales_mat(a, 1, k, k);
 
+    ncnn::Mat a_int8 = a;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = false;
+        ncnn::quantize_to_int8(a, a_int8, input_scales, opt);
+    }
+
     if (bias)
     {
         weights[1] = RandomMat(outch);
@@ -231,11 +273,28 @@ static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias)
         weights[2] = input_scales;
     }
 
-    int flag = TEST_LAYER_DISABLE_GPU_TESTING;
-    int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, flag);
+    int flag = input_int8 ? TEST_LAYER_DISABLE_AUTO_INPUT_CASTING : 0;
+    int ret = 0;
+    if (input_int8)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = true;
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_fp16_arithmetic = false;
+        opt.use_bf16_packed = false;
+        opt.use_bf16_storage = false;
+
+        ret = test_layer_opt("InnerProduct", pd, weights, opt, a_int8, 0.001f, flag);
+    }
+    else
+    {
+        ret = test_layer("InnerProduct", pd, weights, a_int8, 0.001f, flag);
+    }
     if (ret != 0)
     {
-        fprintf(stderr, "test_innerproduct_gemm_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d\n", a.dims, a.w, a.h, a.c, outch, bias);
+        fprintf(stderr, "test_innerproduct_gemm_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d input_int8=%d weight_int8=%d\n", a.dims, a.w, a.h, a.c, outch, bias, input_int8, weight_int8);
     }
 
     return ret;
@@ -252,7 +311,11 @@ static int test_innerproduct_6()
            || test_innerproduct_gemm_int8(RandomMat(16, 12), 16, 0)
            || test_innerproduct_gemm_int8(RandomMat(4, 15), 8, 1)
            || test_innerproduct_gemm_int8(RandomMat(6, 16), 16, 0)
-           || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1);
+           || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1)
+           || test_innerproduct_gemm_int8(RandomMat(11, 16), 8, 1, false, true)
+           || test_innerproduct_gemm_int8(RandomMat(13, 15), 7, 1, true)
+           || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1, true)
+           || test_innerproduct_gemm_int8(RandomMat(12, 16), 7, 1, true, true);
 }
 
 static int test_innerproduct_7()
diff --git a/tests/test_padding.cpp b/tests/test_padding.cpp
index 6d7c07a174cc..4e4190d5780a 100644
--- a/tests/test_padding.cpp
+++ b/tests/test_padding.cpp
@@ -237,7 +237,7 @@ static int test_padding_int8(const ncnn::Mat& a, int top, int bottom, int left,
     if (per_channel_pad_data_size)
         weights[0] = RandomMat(per_channel_pad_data_size);
 
-    int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_DISABLE_GPU_TESTING;
+    int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING;
     int ret = test_layer("Padding", pd, weights, a, 0.001, flag);
     if (ret != 0)
     {