| 7| 6| 5| 4| 3| 2| 1| 0| q0 | 11| 12| 13| 14| 15| 16| 17| 18| q1 |206|205|204|203|202|201|200|199|
after VTRN instruction (vtrnq_u16) | 7| 6| 5| 4| 3| 2| 1| 0| q0 |205| 12|203| 14|201| 16|199| 18| q1 |206| 11|204| 13|202| 15|200| 17|
| 3| 2| 1| 0| d0 |207| 11|100|999| d1 |206| 12|101|998| d2 |205| 13|102|997| d3 |204| 14|103|996|
after VTRN.16 d0 d1 | 3| 2| 1| 0| d0 | 12| 11|998|999| d1 |206|207|101|100| d2 |205| 13|102|997| d3 |204| 14|103|996|
after VTRN.16 d2 d3 | 3| 2| 1| 0| d0 | 12| 11|998|999| d1 |206|207|101|100| d2 | 14| 13|996|997| d3 |204|205|103|102|
after VTRN.32 q0 q1 (q0 = d0,d1; q1 = d2,d3) | 3| 2| 1| 0| d0 |996|997|998|999| d1 |103|102|101|100| d2 | 14| 13| 12| 11| d3 |204|205|206|207|
#geshi(asm){{
@ Transpose a 4x4 matrix of 32-bit elements held in q0-q3, one row per register.
@ q0 = d0,d1   q1 = d2,d3   q2 = d4,d5   q3 = d6,d7  (low half, high half)
vtrn.32 q0, q1    @ exchange odd lanes of q0 with even lanes of q1 (2x2 transposes of rows 0/1)
vtrn.32 q2, q3    @ same for rows 2/3
vswp    d1, d4    @ swap the high half of q0 with the low half of q2
vswp    d3, d6    @ swap the high half of q1 with the low half of q3
}}
| 3| 2| 1| 0| q0 | 0.1| 11.0|100.0|999.0| q1 | 0.2| 12.0|101.0|998.0| q2 | 0.3| 13.0|102.0|997.0| q3 | 0.4| 14.0|103.0|996.0|
after VTRN.32 q0 q1 | 3| 2| 1| 0| q0 | 12.0| 11.0|998.0|999.0| q1 | 0.2| 0.1|101.0|100.0| q2 | 0.3| 13.0|102.0|997.0| q3 | 0.4| 14.0|103.0|996.0|
after VTRN.32 q2 q3 | 3| 2| 1| 0| q0 | 12.0| 11.0|998.0|999.0| q1 | 0.2| 0.1|101.0|100.0| q2 | 14.0| 13.0|996.0|997.0| q3 | 0.4| 0.3|103.0|102.0|
after VSWP d1 d4 | 3| 2| 1| 0| q0 |996.0|997.0|998.0|999.0| q1 | 0.2| 0.1|101.0|100.0| q2 | 14.0| 13.0| 12.0| 11.0| q3 | 0.4| 0.3|103.0|102.0|
after VSWP d3 d6 | 3| 2| 1| 0| q0 |996.0|997.0|998.0|999.0| q1 |103.0|102.0|101.0|100.0| q2 | 14.0| 13.0| 12.0| 11.0| q3 | 0.4| 0.3| 0.2| 0.1|
#geshi(c++){{ asm( "VMOV q0 %4\n\t" // Copy input to q0 "VMOV q1 %5\n\t" // Copy input to q1 "VMOV q2 %6\n\t" // Copy input to q2 "VMOV q3 %7\n\t" // Copy input to q3 "vtrn.32 q0, q1\n\t" "vtrn.32 q2, q3\n\t" "vswp d1, d4\n\t" "vswp d3, d6\n\t" "VMOV %0 q0\n\t" // write back q0 "VMOV %1 q1\n\t" // write back q1 "VMOV %2 q2\n\t" // write back q2 "VMOV %3 q3\n\t" // write back q3
: "=r" (transposed_v0),"=r" (transposed_v1),"=r" (transposed_v2),"=r" (transposed_v3) : "r" (src_v0),"r" (src_v1),"r" (src_v2),"r" (src_v3) : "q0","q1","q2","q3"); }}
#geshi(c++){{
// 4x4 transpose with NEON intrinsics: load four rows of 8-bit values, convert
// them to float, and transpose so that _dstN holds element N of every row.
//
// NOTE(review): assumes src_v0..src_v3 are `const uint8_t*` and that 8 bytes are
// readable at each pointer -- vld1_u8 always loads 8 bytes, of which only the
// low 4 are used here. Confirm against the caller.
//
// vld1_u8 yields uint8x8_t, but vcvtq_f32_u32 needs uint32x4_t, so each row is
// widened u8 -> u16 -> u32 before the integer-to-float conversion.
float32x4_t _v0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vld1_u8(src_v0)))));
float32x4_t _v1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vld1_u8(src_v1)))));
float32x4_t _v2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vld1_u8(src_v2)))));
float32x4_t _v3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vld1_u8(src_v3)))));

// vtrnq_f32(a, b): val[0] = {a0, b0, a2, b2}, val[1] = {a1, b1, a3, b3}
float32x4x2_t v01 = vtrnq_f32(_v0, _v1);
float32x4x2_t v23 = vtrnq_f32(_v2, _v3);

// Recombine 64-bit halves; this is the intrinsic equivalent of the two VSWPs.
float32x4_t _dst0 = vcombine_f32(vget_low_f32(v01.val[0]),  vget_low_f32(v23.val[0]));   // {v0[0], v1[0], v2[0], v3[0]}
float32x4_t _dst1 = vcombine_f32(vget_low_f32(v01.val[1]),  vget_low_f32(v23.val[1]));   // {v0[1], v1[1], v2[1], v3[1]}
float32x4_t _dst2 = vcombine_f32(vget_high_f32(v01.val[0]), vget_high_f32(v23.val[0]));  // {v0[2], v1[2], v2[2], v3[2]}
float32x4_t _dst3 = vcombine_f32(vget_high_f32(v01.val[1]), vget_high_f32(v23.val[1]));  // {v0[3], v1[3], v2[3], v3[3]}
}}