00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_q15.c 00009 * 00010 * Description: Convolution of Q15 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00068 void arm_conv_q15( 00069 q15_t * pSrcA, 00070 uint32_t srcALen, 00071 q15_t * pSrcB, 00072 uint32_t srcBLen, 00073 q15_t * pDst) 00074 { 00075 00076 #ifndef ARM_MATH_CM0 00077 00078 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00079 00080 q15_t *pIn1; /* inputA pointer */ 00081 q15_t *pIn2; /* inputB pointer */ 00082 q15_t *pOut = pDst; /* output pointer */ 00083 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00084 q15_t *px; /* Intermediate inputA pointer */ 00085 q15_t *py; /* Intermediate inputB pointer */ 00086 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00087 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00088 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ 00089 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00090 00091 00092 /* The algorithm implementation is based on the lengths of the inputs. */ 00093 /* srcB is always made to slide across srcA. */ 00094 /* So srcBLen is always considered as shorter or equal to srcALen */ 00095 if(srcALen >= srcBLen) 00096 { 00097 /* Initialization of inputA pointer */ 00098 pIn1 = pSrcA; 00099 00100 /* Initialization of inputB pointer */ 00101 pIn2 = pSrcB; 00102 } 00103 else 00104 { 00105 /* Initialization of inputA pointer */ 00106 pIn1 = pSrcB; 00107 00108 /* Initialization of inputB pointer */ 00109 pIn2 = pSrcA; 00110 00111 /* srcBLen is always considered as shorter or equal to srcALen */ 00112 j = srcBLen; 00113 srcBLen = srcALen; 00114 srcALen = j; 00115 } 00116 00117 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00118 /* The function is internally 00119 * divided into three stages according to the number of multiplications that has to be 00120 * taken place between inputA samples and inputB samples. In the first stage of the 00121 * algorithm, the multiplications increase by one for every iteration. 00122 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00123 * In the third stage of the algorithm, the multiplications decrease by one 00124 * for every iteration. */ 00125 00126 /* The algorithm is implemented in three stages. 00127 The loop counters of each stage is initiated here. */ 00128 blockSize1 = srcBLen - 1u; 00129 blockSize2 = srcALen - (srcBLen - 1u); 00130 00131 /* -------------------------- 00132 * Initializations of stage1 00133 * -------------------------*/ 00134 00135 /* sum = x[0] * y[0] 00136 * sum = x[0] * y[1] + x[1] * y[0] 00137 * .... 00138 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00139 */ 00140 00141 /* In this stage the MAC operations are increased by 1 for every iteration. 00142 The count variable holds the number of MAC operations performed */ 00143 count = 1u; 00144 00145 /* Working pointer of inputA */ 00146 px = pIn1; 00147 00148 /* Working pointer of inputB */ 00149 py = pIn2; 00150 00151 00152 /* ------------------------ 00153 * Stage1 process 00154 * ----------------------*/ 00155 00156 /* For loop unrolling by 4, this stage is divided into two. */ 00157 /* First part of this stage computes the MAC operations less than 4 */ 00158 /* Second part of this stage computes the MAC operations greater than or equal to 4 */ 00159 00160 /* The first part of the stage starts here */ 00161 while((count < 4u) && (blockSize1 > 0u)) 00162 { 00163 /* Accumulator is made zero for every iteration */ 00164 sum = 0; 00165 00166 /* Loop over number of MAC operations between 00167 * inputA samples and inputB samples */ 00168 k = count; 00169 00170 while(k > 0u) 00171 { 00172 /* Perform the multiply-accumulates */ 00173 sum = __SMLALD(*px++, *py--, sum); 00174 00175 /* Decrement the loop counter */ 00176 k--; 00177 } 00178 00179 /* Store the result in the accumulator in the destination buffer. */ 00180 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00181 00182 /* Update the inputA and inputB pointers for next MAC calculation */ 00183 py = pIn2 + count; 00184 px = pIn1; 00185 00186 /* Increment the MAC count */ 00187 count++; 00188 00189 /* Decrement the loop counter */ 00190 blockSize1--; 00191 } 00192 00193 /* The second part of the stage starts here */ 00194 /* The internal loop, over count, is unrolled by 4 */ 00195 /* To, read the last two inputB samples using SIMD: 00196 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ 00197 py = py - 1; 00198 00199 while(blockSize1 > 0u) 00200 { 00201 /* Accumulator is made zero for every iteration */ 00202 sum = 0; 00203 00204 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00205 k = count >> 2u; 00206 00207 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00208 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00209 while(k > 0u) 00210 { 00211 /* Perform the multiply-accumulates */ 00212 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00213 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00214 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00215 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00216 00217 /* Decrement the loop counter */ 00218 k--; 00219 } 00220 00221 /* For the next MAC operations, the pointer py is used without SIMD 00222 * So, py is incremented by 1 */ 00223 py = py + 1u; 00224 00225 /* If the count is not a multiple of 4, compute any remaining MACs here. 00226 ** No loop unrolling is used. */ 00227 k = count % 0x4u; 00228 00229 while(k > 0u) 00230 { 00231 /* Perform the multiply-accumulates */ 00232 sum = __SMLALD(*px++, *py--, sum); 00233 00234 /* Decrement the loop counter */ 00235 k--; 00236 } 00237 00238 /* Store the result in the accumulator in the destination buffer. */ 00239 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00240 00241 /* Update the inputA and inputB pointers for next MAC calculation */ 00242 py = pIn2 + (count - 1u); 00243 px = pIn1; 00244 00245 /* Increment the MAC count */ 00246 count++; 00247 00248 /* Decrement the loop counter */ 00249 blockSize1--; 00250 } 00251 00252 /* -------------------------- 00253 * Initializations of stage2 00254 * ------------------------*/ 00255 00256 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00257 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00258 * .... 00259 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00260 */ 00261 00262 /* Working pointer of inputA */ 00263 px = pIn1; 00264 00265 /* Working pointer of inputB */ 00266 pSrc2 = pIn2 + (srcBLen - 1u); 00267 py = pSrc2; 00268 00269 /* Initialize inputB pointer of type q31 */ 00270 pb = (q31_t *) (py - 1u); 00271 00272 /* count is the index by which the pointer pIn1 to be incremented */ 00273 count = 1u; 00274 00275 00276 /* -------------------- 00277 * Stage2 process 00278 * -------------------*/ 00279 00280 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00281 * So, to loop unroll over blockSize2, 00282 * srcBLen should be greater than or equal to 4 */ 00283 if(srcBLen >= 4u) 00284 { 00285 /* Loop unroll over blockSize2, by 4 */ 00286 blkCnt = blockSize2 >> 2u; 00287 00288 while(blkCnt > 0u) 00289 { 00290 /* Set all accumulators to zero */ 00291 acc0 = 0; 00292 acc1 = 0; 00293 acc2 = 0; 00294 acc3 = 0; 00295 00296 00297 /* read x[0], x[1] samples */ 00298 x0 = *(q31_t *) (px++); 00299 /* read x[1], x[2] samples */ 00300 x1 = *(q31_t *) (px++); 00301 00302 00303 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00304 k = srcBLen >> 2u; 00305 00306 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00307 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00308 do 00309 { 00310 /* Read the last two inputB samples using SIMD: 00311 * y[srcBLen - 1] and y[srcBLen - 2] */ 00312 c0 = *(pb--); 00313 00314 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00315 acc0 = __SMLALDX(x0, c0, acc0); 00316 00317 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00318 acc1 = __SMLALDX(x1, c0, acc1); 00319 00320 /* Read x[2], x[3] */ 00321 x2 = *(q31_t *) (px++); 00322 00323 /* Read x[3], x[4] */ 00324 x3 = *(q31_t *) (px++); 00325 00326 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00327 acc2 = __SMLALDX(x2, c0, acc2); 00328 00329 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00330 acc3 = __SMLALDX(x3, c0, acc3); 00331 00332 /* Read y[srcBLen - 3] and y[srcBLen - 4] */ 00333 c0 = *(pb--); 00334 00335 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00336 acc0 = __SMLALDX(x2, c0, acc0); 00337 00338 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00339 acc1 = __SMLALDX(x3, c0, acc1); 00340 00341 /* Read x[4], x[5] */ 00342 x0 = *(q31_t *) (px++); 00343 00344 /* Read x[5], x[6] */ 00345 x1 = *(q31_t *) (px++); 00346 00347 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00348 acc2 = __SMLALDX(x0, c0, acc2); 00349 00350 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00351 acc3 = __SMLALDX(x1, c0, acc3); 00352 00353 } while(--k); 00354 00355 /* For the next MAC operations, SIMD is not used 00356 * So, the 16 bit pointer if inputB, py is updated */ 00357 py = (q15_t *) pb; 00358 py = py + 1; 00359 00360 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00361 ** No loop unrolling is used. */ 00362 k = srcBLen % 0x4u; 00363 00364 if(k == 1u) 00365 { 00366 /* Read y[srcBLen - 5] */ 00367 c0 = *(py); 00368 00369 #ifdef ARM_MATH_BIG_ENDIAN 00370 00371 c0 = c0 << 16u; 00372 00373 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00374 00375 /* Read x[7] */ 00376 x3 = *(q31_t *) px++; 00377 00378 /* Perform the multiply-accumulates */ 00379 acc0 = __SMLALD(x0, c0, acc0); 00380 acc1 = __SMLALD(x1, c0, acc1); 00381 acc2 = __SMLALDX(x1, c0, acc2); 00382 acc3 = __SMLALDX(x3, c0, acc3); 00383 } 00384 00385 if(k == 2u) 00386 { 00387 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00388 c0 = *(pb); 00389 00390 /* Read x[7], x[8] */ 00391 x3 = *(q31_t *) px++; 00392 00393 /* Read x[9] */ 00394 x2 = *(q31_t *) px++; 00395 00396 /* Perform the multiply-accumulates */ 00397 acc0 = __SMLALDX(x0, c0, acc0); 00398 acc1 = __SMLALDX(x1, c0, acc1); 00399 acc2 = __SMLALDX(x3, c0, acc2); 00400 acc3 = __SMLALDX(x2, c0, acc3); 00401 } 00402 00403 if(k == 3u) 00404 { 00405 /* Read y[srcBLen - 5], y[srcBLen - 6] */ 00406 c0 = *pb--; 00407 00408 /* Read x[7], x[8] */ 00409 x3 = *(q31_t *) px++; 00410 00411 /* Read x[9] */ 00412 x2 = *(q31_t *) px++; 00413 00414 /* Perform the multiply-accumulates */ 00415 acc0 = __SMLALDX(x0, c0, acc0); 00416 acc1 = __SMLALDX(x1, c0, acc1); 00417 acc2 = __SMLALDX(x3, c0, acc2); 00418 acc3 = __SMLALDX(x2, c0, acc3); 00419 00420 #ifdef ARM_MATH_BIG_ENDIAN 00421 00422 /* Read y[srcBLen - 7] */ 00423 c0 = (*pb); 00424 00425 //c0 = (c0 & 0x0000FFFF)<<16; 00426 c0 = (c0) << 16; 00427 00428 #else 00429 00430 /* Read y[srcBLen - 7] */ 00431 c0 = (q15_t) (*pb >> 16); 00432 00433 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */ 00434 00435 /* Read x[10] */ 00436 x3 = *(q31_t *) px++; 00437 00438 /* Perform the multiply-accumulates */ 00439 acc0 = __SMLALDX(x1, c0, acc0); 00440 acc1 = __SMLALD(x2, c0, acc1); 00441 acc2 = __SMLALDX(x2, c0, acc2); 00442 acc3 = __SMLALDX(x3, c0, acc3); 00443 } 00444 00445 00446 /* Store the results in the accumulators in the destination buffer. */ 00447 00448 #ifndef ARM_MATH_BIG_ENDIAN 00449 00450 *__SIMD32(pOut)++ = 00451 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00452 *__SIMD32(pOut)++ = 00453 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00454 00455 #else 00456 00457 *__SIMD32(pOut)++ = 00458 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00459 *__SIMD32(pOut)++ = 00460 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00461 00462 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00463 00464 /* Update the inputA and inputB pointers for next MAC calculation */ 00465 px = pIn1 + (count * 4u); 00466 py = pSrc2; 00467 pb = (q31_t *) (py - 1); 00468 00469 /* Increment the pointer pIn1 index, count by 1 */ 00470 count++; 00471 00472 /* Decrement the loop counter */ 00473 blkCnt--; 00474 } 00475 00476 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00477 ** No loop unrolling is used. */ 00478 blkCnt = blockSize2 % 0x4u; 00479 00480 while(blkCnt > 0u) 00481 { 00482 /* Accumulator is made zero for every iteration */ 00483 sum = 0; 00484 00485 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00486 k = srcBLen >> 2u; 00487 00488 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00489 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00490 while(k > 0u) 00491 { 00492 /* Perform the multiply-accumulates */ 00493 sum += (q63_t) ((q31_t) * px++ * *py--); 00494 sum += (q63_t) ((q31_t) * px++ * *py--); 00495 sum += (q63_t) ((q31_t) * px++ * *py--); 00496 sum += (q63_t) ((q31_t) * px++ * *py--); 00497 00498 /* Decrement the loop counter */ 00499 k--; 00500 } 00501 00502 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00503 ** No loop unrolling is used. */ 00504 k = srcBLen % 0x4u; 00505 00506 while(k > 0u) 00507 { 00508 /* Perform the multiply-accumulates */ 00509 sum += (q63_t) ((q31_t) * px++ * *py--); 00510 00511 /* Decrement the loop counter */ 00512 k--; 00513 } 00514 00515 /* Store the result in the accumulator in the destination buffer. */ 00516 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00517 00518 /* Update the inputA and inputB pointers for next MAC calculation */ 00519 px = pIn1 + count; 00520 py = pSrc2; 00521 00522 /* Increment the pointer pIn1 index, count by 1 */ 00523 count++; 00524 00525 /* Decrement the loop counter */ 00526 blkCnt--; 00527 } 00528 } 00529 else 00530 { 00531 /* If the srcBLen is not a multiple of 4, 00532 * the blockSize2 loop cannot be unrolled by 4 */ 00533 blkCnt = blockSize2; 00534 00535 while(blkCnt > 0u) 00536 { 00537 /* Accumulator is made zero for every iteration */ 00538 sum = 0; 00539 00540 /* srcBLen number of MACS should be performed */ 00541 k = srcBLen; 00542 00543 while(k > 0u) 00544 { 00545 /* Perform the multiply-accumulate */ 00546 sum += (q63_t) ((q31_t) * px++ * *py--); 00547 00548 /* Decrement the loop counter */ 00549 k--; 00550 } 00551 00552 /* Store the result in the accumulator in the destination buffer. */ 00553 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); 00554 00555 /* Update the inputA and inputB pointers for next MAC calculation */ 00556 px = pIn1 + count; 00557 py = pSrc2; 00558 00559 /* Increment the MAC count */ 00560 count++; 00561 00562 /* Decrement the loop counter */ 00563 blkCnt--; 00564 } 00565 } 00566 00567 00568 /* -------------------------- 00569 * Initializations of stage3 00570 * -------------------------*/ 00571 00572 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00573 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00574 * .... 00575 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00576 * sum += x[srcALen-1] * y[srcBLen-1] 00577 */ 00578 00579 /* In this stage the MAC operations are decreased by 1 for every iteration. 00580 The blockSize3 variable holds the number of MAC operations performed */ 00581 00582 blockSize3 = srcBLen - 1u; 00583 00584 /* Working pointer of inputA */ 00585 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00586 px = pSrc1; 00587 00588 /* Working pointer of inputB */ 00589 pSrc2 = pIn2 + (srcBLen - 1u); 00590 pIn2 = pSrc2 - 1u; 00591 py = pIn2; 00592 00593 /* ------------------- 00594 * Stage3 process 00595 * ------------------*/ 00596 00597 /* For loop unrolling by 4, this stage is divided into two. */ 00598 /* First part of this stage computes the MAC operations greater than 4 */ 00599 /* Second part of this stage computes the MAC operations less than or equal to 4 */ 00600 00601 /* The first part of the stage starts here */ 00602 j = blockSize3 >> 2u; 00603 00604 while((j > 0u) && (blockSize3 > 0u)) 00605 { 00606 /* Accumulator is made zero for every iteration */ 00607 sum = 0; 00608 00609 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00610 k = blockSize3 >> 2u; 00611 00612 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00613 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00614 while(k > 0u) 00615 { 00616 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied 00617 * with y[srcBLen - 1], y[srcBLen - 2] respectively */ 00618 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00619 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied 00620 * with y[srcBLen - 3], y[srcBLen - 4] respectively */ 00621 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); 00622 00623 /* Decrement the loop counter */ 00624 k--; 00625 } 00626 00627 /* For the next MAC operations, the pointer py is used without SIMD 00628 * So, py is incremented by 1 */ 00629 py = py + 1u; 00630 00631 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00632 ** No loop unrolling is used. */ 00633 k = blockSize3 % 0x4u; 00634 00635 while(k > 0u) 00636 { 00637 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ 00638 sum = __SMLALD(*px++, *py--, sum); 00639 00640 /* Decrement the loop counter */ 00641 k--; 00642 } 00643 00644 /* Store the result in the accumulator in the destination buffer. */ 00645 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00646 00647 /* Update the inputA and inputB pointers for next MAC calculation */ 00648 px = ++pSrc1; 00649 py = pIn2; 00650 00651 /* Decrement the loop counter */ 00652 blockSize3--; 00653 00654 j--; 00655 } 00656 00657 /* The second part of the stage starts here */ 00658 /* SIMD is not used for the next MAC operations, 00659 * so pointer py is updated to read only one sample at a time */ 00660 py = py + 1u; 00661 00662 while(blockSize3 > 0u) 00663 { 00664 /* Accumulator is made zero for every iteration */ 00665 sum = 0; 00666 00667 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00668 k = blockSize3; 00669 00670 while(k > 0u) 00671 { 00672 /* Perform the multiply-accumulates */ 00673 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00674 sum = __SMLALD(*px++, *py--, sum); 00675 00676 /* Decrement the loop counter */ 00677 k--; 00678 } 00679 00680 /* Store the result in the accumulator in the destination buffer. */ 00681 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); 00682 00683 /* Update the inputA and inputB pointers for next MAC calculation */ 00684 px = ++pSrc1; 00685 py = pSrc2; 00686 00687 /* Decrement the loop counter */ 00688 blockSize3--; 00689 } 00690 00691 #else 00692 00693 /* Run the below code for Cortex-M0 */ 00694 00695 q15_t *pIn1 = pSrcA; /* input pointer */ 00696 q15_t *pIn2 = pSrcB; /* coefficient pointer */ 00697 q63_t sum; /* Accumulator */ 00698 uint32_t i, j; /* loop counter */ 00699 00700 /* Loop to calculate output of convolution for output length number of times */ 00701 for (i = 0; i < (srcALen + srcBLen - 1); i++) 00702 { 00703 /* Initialize sum with zero to carry on MAC operations */ 00704 sum = 0; 00705 00706 /* Loop to perform MAC operations according to convolution equation */ 00707 for (j = 0; j <= i; j++) 00708 { 00709 /* Check the array limitations */ 00710 if(((i - j) < srcBLen) && (j < srcALen)) 00711 { 00712 /* z[i] += x[i-j] * y[j] */ 00713 sum += (q31_t) pIn1[j] * (pIn2[i - j]); 00714 } 00715 } 00716 00717 /* Store the output in the destination buffer */ 00718 pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); 00719 } 00720 00721 #endif /* #ifndef ARM_MATH_CM0 */ 00722 00723 } 00724