00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q7.c 00009 * 00010 * Description: Correlation of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00065 void arm_correlate_q7( 00066 q7_t * pSrcA, 00067 uint32_t srcALen, 00068 q7_t * pSrcB, 00069 uint32_t srcBLen, 00070 q7_t * pDst) 00071 { 00072 00073 00074 #ifndef ARM_MATH_CM0 00075 00076 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00077 00078 q7_t *pIn1; /* inputA pointer */ 00079 q7_t *pIn2; /* inputB pointer */ 00080 q7_t *pOut = pDst; /* output pointer */ 00081 q7_t *px; /* Intermediate inputA pointer */ 00082 q7_t *py; /* Intermediate inputB pointer */ 00083 q7_t *pSrc1; /* Intermediate pointers */ 00084 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00085 q31_t input1, input2; /* temporary variables */ 00086 q15_t in1, in2; /* temporary variables */ 00087 q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */ 00088 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00089 int32_t inc = 1; 00090 00091 00092 /* The algorithm implementation is based on the lengths of the inputs. */ 00093 /* srcB is always made to slide across srcA. */ 00094 /* So srcBLen is always considered as shorter or equal to srcALen */ 00095 /* But CORR(x, y) is reverse of CORR(y, x) */ 00096 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00097 /* and the destination pointer modifier, inc is set to -1 */ 00098 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00099 /* But to improve the performance, 00100 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00101 /* If srcALen > srcBLen, 00102 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00103 /* If srcALen < srcBLen, 00104 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00105 if(srcALen >= srcBLen) 00106 { 00107 /* Initialization of inputA pointer */ 00108 pIn1 = (pSrcA); 00109 00110 /* Initialization of inputB pointer */ 00111 pIn2 = (pSrcB); 00112 00113 /* Number of output samples is calculated */ 00114 outBlockSize = (2u * srcALen) - 1u; 00115 00116 /* When srcALen > srcBLen, zero padding is done to srcB 00117 * to make their lengths equal. 00118 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00119 * number of output samples are made zero */ 00120 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00121 00122 /* Updating the pointer position to non zero value */ 00123 pOut += j; 00124 00125 } 00126 else 00127 { 00128 /* Initialization of inputA pointer */ 00129 pIn1 = (pSrcB); 00130 00131 /* Initialization of inputB pointer */ 00132 pIn2 = (pSrcA); 00133 00134 /* srcBLen is always considered as shorter or equal to srcALen */ 00135 j = srcBLen; 00136 srcBLen = srcALen; 00137 srcALen = j; 00138 00139 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00140 /* Hence set the destination pointer to point to the last output sample */ 00141 pOut = pDst + ((srcALen + srcBLen) - 2u); 00142 00143 /* Destination address modifier is set to -1 */ 00144 inc = -1; 00145 00146 } 00147 00148 /* The function is internally 00149 * divided into three parts according to the number of multiplications that has to be 00150 * taken place between inputA samples and inputB samples. In the first part of the 00151 * algorithm, the multiplications increase by one for every iteration. 00152 * In the second part of the algorithm, srcBLen number of multiplications are done. 00153 * In the third part of the algorithm, the multiplications decrease by one 00154 * for every iteration.*/ 00155 /* The algorithm is implemented in three stages. 00156 * The loop counters of each stage is initiated here. */ 00157 blockSize1 = srcBLen - 1u; 00158 blockSize2 = srcALen - (srcBLen - 1u); 00159 blockSize3 = blockSize1; 00160 00161 /* -------------------------- 00162 * Initializations of stage1 00163 * -------------------------*/ 00164 00165 /* sum = x[0] * y[srcBlen - 1] 00166 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00167 * .... 00168 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00169 */ 00170 00171 /* In this stage the MAC operations are increased by 1 for every iteration. 00172 The count variable holds the number of MAC operations performed */ 00173 count = 1u; 00174 00175 /* Working pointer of inputA */ 00176 px = pIn1; 00177 00178 /* Working pointer of inputB */ 00179 pSrc1 = pIn2 + (srcBLen - 1u); 00180 py = pSrc1; 00181 00182 /* ------------------------ 00183 * Stage1 process 00184 * ----------------------*/ 00185 00186 /* The first stage starts here */ 00187 while(blockSize1 > 0u) 00188 { 00189 /* Accumulator is made zero for every iteration */ 00190 sum = 0; 00191 00192 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00193 k = count >> 2; 00194 00195 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00196 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00197 while(k > 0u) 00198 { 00199 /* x[0] , x[1] */ 00200 in1 = (q15_t) * px++; 00201 in2 = (q15_t) * px++; 00202 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00203 00204 /* y[srcBLen - 4] , y[srcBLen - 3] */ 00205 in1 = (q15_t) * py++; 00206 in2 = (q15_t) * py++; 00207 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00208 00209 /* x[0] * y[srcBLen - 4] */ 00210 /* x[1] * y[srcBLen - 3] */ 00211 sum = __SMLAD(input1, input2, sum); 00212 00213 /* x[2] , x[3] */ 00214 in1 = (q15_t) * px++; 00215 in2 = (q15_t) * px++; 00216 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00217 00218 /* y[srcBLen - 2] , y[srcBLen - 1] */ 00219 in1 = (q15_t) * py++; 00220 in2 = (q15_t) * py++; 00221 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00222 00223 /* x[2] * y[srcBLen - 2] */ 00224 /* x[3] * y[srcBLen - 1] */ 00225 sum = __SMLAD(input1, input2, sum); 00226 00227 00228 /* Decrement the loop counter */ 00229 k--; 00230 } 00231 00232 /* If the count is not a multiple of 4, compute any remaining MACs here. 00233 ** No loop unrolling is used. */ 00234 k = count % 0x4u; 00235 00236 while(k > 0u) 00237 { 00238 /* Perform the multiply-accumulates */ 00239 /* x[0] * y[srcBLen - 1] */ 00240 sum += (q31_t) ((q15_t) * px++ * *py++); 00241 00242 /* Decrement the loop counter */ 00243 k--; 00244 } 00245 00246 /* Store the result in the accumulator in the destination buffer. */ 00247 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00248 /* Destination pointer is updated according to the address modifier, inc */ 00249 pOut += inc; 00250 00251 /* Update the inputA and inputB pointers for next MAC calculation */ 00252 py = pSrc1 - count; 00253 px = pIn1; 00254 00255 /* Increment the MAC count */ 00256 count++; 00257 00258 /* Decrement the loop counter */ 00259 blockSize1--; 00260 } 00261 00262 /* -------------------------- 00263 * Initializations of stage2 00264 * ------------------------*/ 00265 00266 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00267 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00268 * .... 00269 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00270 */ 00271 00272 /* Working pointer of inputA */ 00273 px = pIn1; 00274 00275 /* Working pointer of inputB */ 00276 py = pIn2; 00277 00278 /* count is index by which the pointer pIn1 to be incremented */ 00279 count = 1u; 00280 00281 /* ------------------- 00282 * Stage2 process 00283 * ------------------*/ 00284 00285 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00286 * So, to loop unroll over blockSize2, 00287 * srcBLen should be greater than or equal to 4 */ 00288 if(srcBLen >= 4u) 00289 { 00290 /* Loop unroll over blockSize2, by 4 */ 00291 blkCnt = blockSize2 >> 2u; 00292 00293 while(blkCnt > 0u) 00294 { 00295 /* Set all accumulators to zero */ 00296 acc0 = 0; 00297 acc1 = 0; 00298 acc2 = 0; 00299 acc3 = 0; 00300 00301 /* read x[0], x[1], x[2] samples */ 00302 x0 = *px++; 00303 x1 = *px++; 00304 x2 = *px++; 00305 00306 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00307 k = srcBLen >> 2u; 00308 00309 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00310 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00311 do 00312 { 00313 /* Read y[0] sample */ 00314 c0 = *py++; 00315 /* Read y[1] sample */ 00316 c1 = *py++; 00317 00318 /* Read x[3] sample */ 00319 x3 = *px++; 00320 00321 /* x[0] and x[1] are packed */ 00322 in1 = (q15_t) x0; 00323 in2 = (q15_t) x1; 00324 00325 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00326 00327 /* y[0] and y[1] are packed */ 00328 in1 = (q15_t) c0; 00329 in2 = (q15_t) c1; 00330 00331 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00332 00333 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00334 acc0 = __SMLAD(input1, input2, acc0); 00335 00336 /* x[1] and x[2] are packed */ 00337 in1 = (q15_t) x1; 00338 in2 = (q15_t) x2; 00339 00340 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00341 00342 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00343 acc1 = __SMLAD(input1, input2, acc1); 00344 00345 /* x[2] and x[3] are packed */ 00346 in1 = (q15_t) x2; 00347 in2 = (q15_t) x3; 00348 00349 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00350 00351 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00352 acc2 = __SMLAD(input1, input2, acc2); 00353 00354 /* Read x[4] sample */ 00355 x0 = *(px++); 00356 00357 /* x[3] and x[4] are packed */ 00358 in1 = (q15_t) x3; 00359 in2 = (q15_t) x0; 00360 00361 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00362 00363 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00364 acc3 = __SMLAD(input1, input2, acc3); 00365 00366 /* Read y[2] sample */ 00367 c0 = *py++; 00368 /* Read y[3] sample */ 00369 c1 = *py++; 00370 00371 /* Read x[5] sample */ 00372 x1 = *px++; 00373 00374 /* x[2] and x[3] are packed */ 00375 in1 = (q15_t) x2; 00376 in2 = (q15_t) x3; 00377 00378 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00379 00380 /* y[2] and y[3] are packed */ 00381 in1 = (q15_t) c0; 00382 in2 = (q15_t) c1; 00383 00384 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00385 00386 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00387 acc0 = __SMLAD(input1, input2, acc0); 00388 00389 /* x[3] and x[4] are packed */ 00390 in1 = (q15_t) x3; 00391 in2 = (q15_t) x0; 00392 00393 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00394 00395 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00396 acc1 = __SMLAD(input1, input2, acc1); 00397 00398 /* x[4] and x[5] are packed */ 00399 in1 = (q15_t) x0; 00400 in2 = (q15_t) x1; 00401 00402 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00403 00404 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00405 acc2 = __SMLAD(input1, input2, acc2); 00406 00407 /* Read x[6] sample */ 00408 x2 = *px++; 00409 00410 /* x[5] and x[6] are packed */ 00411 in1 = (q15_t) x1; 00412 in2 = (q15_t) x2; 00413 00414 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00415 00416 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00417 acc3 = __SMLAD(input1, input2, acc3); 00418 00419 } while(--k); 00420 00421 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00422 ** No loop unrolling is used. */ 00423 k = srcBLen % 0x4u; 00424 00425 while(k > 0u) 00426 { 00427 /* Read y[4] sample */ 00428 c0 = *py++; 00429 00430 /* Read x[7] sample */ 00431 x3 = *px++; 00432 00433 /* Perform the multiply-accumulates */ 00434 /* acc0 += x[4] * y[4] */ 00435 acc0 += ((q15_t) x0 * c0); 00436 /* acc1 += x[5] * y[4] */ 00437 acc1 += ((q15_t) x1 * c0); 00438 /* acc2 += x[6] * y[4] */ 00439 acc2 += ((q15_t) x2 * c0); 00440 /* acc3 += x[7] * y[4] */ 00441 acc3 += ((q15_t) x3 * c0); 00442 00443 /* Reuse the present samples for the next MAC */ 00444 x0 = x1; 00445 x1 = x2; 00446 x2 = x3; 00447 00448 /* Decrement the loop counter */ 00449 k--; 00450 } 00451 00452 /* Store the result in the accumulator in the destination buffer. */ 00453 *pOut = (q7_t) (__SSAT(acc0 >> 7, 8)); 00454 /* Destination pointer is updated according to the address modifier, inc */ 00455 pOut += inc; 00456 00457 *pOut = (q7_t) (__SSAT(acc1 >> 7, 8)); 00458 pOut += inc; 00459 00460 *pOut = (q7_t) (__SSAT(acc2 >> 7, 8)); 00461 pOut += inc; 00462 00463 *pOut = (q7_t) (__SSAT(acc3 >> 7, 8)); 00464 pOut += inc; 00465 00466 /* Update the inputA and inputB pointers for next MAC calculation */ 00467 px = pIn1 + (count * 4u); 00468 py = pIn2; 00469 00470 /* Increment the pointer pIn1 index, count by 1 */ 00471 count++; 00472 00473 /* Decrement the loop counter */ 00474 blkCnt--; 00475 } 00476 00477 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00478 ** No loop unrolling is used. */ 00479 blkCnt = blockSize2 % 0x4u; 00480 00481 while(blkCnt > 0u) 00482 { 00483 /* Accumulator is made zero for every iteration */ 00484 sum = 0; 00485 00486 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00487 k = srcBLen >> 2u; 00488 00489 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00490 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00491 while(k > 0u) 00492 { 00493 /* Reading two inputs of SrcA buffer and packing */ 00494 in1 = (q15_t) * px++; 00495 in2 = (q15_t) * px++; 00496 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00497 00498 /* Reading two inputs of SrcB buffer and packing */ 00499 in1 = (q15_t) * py++; 00500 in2 = (q15_t) * py++; 00501 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00502 00503 /* Perform the multiply-accumulates */ 00504 sum = __SMLAD(input1, input2, sum); 00505 00506 /* Reading two inputs of SrcA buffer and packing */ 00507 in1 = (q15_t) * px++; 00508 in2 = (q15_t) * px++; 00509 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00510 00511 /* Reading two inputs of SrcB buffer and packing */ 00512 in1 = (q15_t) * py++; 00513 in2 = (q15_t) * py++; 00514 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00515 00516 /* Perform the multiply-accumulates */ 00517 sum = __SMLAD(input1, input2, sum); 00518 00519 /* Decrement the loop counter */ 00520 k--; 00521 } 00522 00523 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00524 ** No loop unrolling is used. */ 00525 k = srcBLen % 0x4u; 00526 00527 while(k > 0u) 00528 { 00529 /* Perform the multiply-accumulates */ 00530 sum += ((q15_t) * px++ * *py++); 00531 00532 /* Decrement the loop counter */ 00533 k--; 00534 } 00535 00536 /* Store the result in the accumulator in the destination buffer. */ 00537 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00538 /* Destination pointer is updated according to the address modifier, inc */ 00539 pOut += inc; 00540 00541 /* Update the inputA and inputB pointers for next MAC calculation */ 00542 px = pIn1 + count; 00543 py = pIn2; 00544 00545 /* Increment the pointer pIn1 index, count by 1 */ 00546 count++; 00547 00548 /* Decrement the loop counter */ 00549 blkCnt--; 00550 } 00551 } 00552 else 00553 { 00554 /* If the srcBLen is not a multiple of 4, 00555 * the blockSize2 loop cannot be unrolled by 4 */ 00556 blkCnt = blockSize2; 00557 00558 while(blkCnt > 0u) 00559 { 00560 /* Accumulator is made zero for every iteration */ 00561 sum = 0; 00562 00563 /* Loop over srcBLen */ 00564 k = srcBLen; 00565 00566 while(k > 0u) 00567 { 00568 /* Perform the multiply-accumulate */ 00569 sum += ((q15_t) * px++ * *py++); 00570 00571 /* Decrement the loop counter */ 00572 k--; 00573 } 00574 00575 /* Store the result in the accumulator in the destination buffer. */ 00576 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00577 /* Destination pointer is updated according to the address modifier, inc */ 00578 pOut += inc; 00579 00580 /* Update the inputA and inputB pointers for next MAC calculation */ 00581 px = pIn1 + count; 00582 py = pIn2; 00583 00584 /* Increment the MAC count */ 00585 count++; 00586 00587 /* Decrement the loop counter */ 00588 blkCnt--; 00589 } 00590 } 00591 00592 /* -------------------------- 00593 * Initializations of stage3 00594 * -------------------------*/ 00595 00596 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00597 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00598 * .... 00599 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00600 * sum += x[srcALen-1] * y[0] 00601 */ 00602 00603 /* In this stage the MAC operations are decreased by 1 for every iteration. 00604 The count variable holds the number of MAC operations performed */ 00605 count = srcBLen - 1u; 00606 00607 /* Working pointer of inputA */ 00608 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00609 px = pSrc1; 00610 00611 /* Working pointer of inputB */ 00612 py = pIn2; 00613 00614 /* ------------------- 00615 * Stage3 process 00616 * ------------------*/ 00617 00618 while(blockSize3 > 0u) 00619 { 00620 /* Accumulator is made zero for every iteration */ 00621 sum = 0; 00622 00623 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00624 k = count >> 2u; 00625 00626 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00627 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00628 while(k > 0u) 00629 { 00630 /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */ 00631 in1 = (q15_t) * px++; 00632 in2 = (q15_t) * px++; 00633 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00634 00635 /* y[0] , y[1] */ 00636 in1 = (q15_t) * py++; 00637 in2 = (q15_t) * py++; 00638 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00639 00640 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00641 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00642 sum = __SMLAD(input1, input2, sum); 00643 00644 /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */ 00645 in1 = (q15_t) * px++; 00646 in2 = (q15_t) * px++; 00647 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00648 00649 /* y[2] , y[3] */ 00650 in1 = (q15_t) * py++; 00651 in2 = (q15_t) * py++; 00652 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00653 00654 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00655 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00656 sum = __SMLAD(input1, input2, sum); 00657 00658 /* Decrement the loop counter */ 00659 k--; 00660 } 00661 00662 /* If the count is not a multiple of 4, compute any remaining MACs here. 00663 ** No loop unrolling is used. */ 00664 k = count % 0x4u; 00665 00666 while(k > 0u) 00667 { 00668 /* Perform the multiply-accumulates */ 00669 sum += ((q15_t) * px++ * *py++); 00670 00671 /* Decrement the loop counter */ 00672 k--; 00673 } 00674 00675 /* Store the result in the accumulator in the destination buffer. */ 00676 *pOut = (q7_t) (__SSAT(sum >> 7, 8)); 00677 /* Destination pointer is updated according to the address modifier, inc */ 00678 pOut += inc; 00679 00680 /* Update the inputA and inputB pointers for next MAC calculation */ 00681 px = ++pSrc1; 00682 py = pIn2; 00683 00684 /* Decrement the MAC count */ 00685 count--; 00686 00687 /* Decrement the loop counter */ 00688 blockSize3--; 00689 } 00690 00691 #else 00692 00693 /* Run the below code for Cortex-M0 */ 00694 00695 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00696 q7_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */ 00697 q31_t sum; /* Accumulator */ 00698 uint32_t i = 0u, j; /* loop counters */ 00699 uint32_t inv = 0u; /* Reverse order flag */ 00700 uint32_t tot = 0u; /* Length */ 00701 00702 /* The algorithm implementation is based on the lengths of the inputs. */ 00703 /* srcB is always made to slide across srcA. */ 00704 /* So srcBLen is always considered as shorter or equal to srcALen */ 00705 /* But CORR(x, y) is reverse of CORR(y, x) */ 00706 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00707 /* and a varaible, inv is set to 1 */ 00708 /* If lengths are not equal then zero pad has to be done to make the two 00709 * inputs of same length. But to improve the performance, we include zeroes 00710 * in the output instead of zero padding either of the the inputs*/ 00711 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the 00712 * starting of the output buffer */ 00713 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the 00714 * ending of the output buffer */ 00715 /* Once the zero padding is done the remaining of the output is calcualted 00716 * using convolution but with the shorter signal time shifted. */ 00717 00718 /* Calculate the length of the remaining sequence */ 00719 tot = ((srcALen + srcBLen) - 2u); 00720 00721 if(srcALen > srcBLen) 00722 { 00723 /* Calculating the number of zeros to be padded to the output */ 00724 j = srcALen - srcBLen; 00725 00726 /* Initialise the pointer after zero padding */ 00727 pDst += j; 00728 } 00729 00730 else if(srcALen < srcBLen) 00731 { 00732 /* Initialization to inputB pointer */ 00733 pIn1 = pSrcB; 00734 00735 /* Initialization to the end of inputA pointer */ 00736 pIn2 = pSrcA + (srcALen - 1u); 00737 00738 /* Initialisation of the pointer after zero padding */ 00739 pDst = pDst + tot; 00740 00741 /* Swapping the lengths */ 00742 j = srcALen; 00743 srcALen = srcBLen; 00744 srcBLen = j; 00745 00746 /* Setting the reverse flag */ 00747 inv = 1; 00748 00749 } 00750 00751 /* Loop to calculate convolution for output length number of times */ 00752 for (i = 0u; i <= tot; i++) 00753 { 00754 /* Initialize sum with zero to carry on MAC operations */ 00755 sum = 0; 00756 00757 /* Loop to perform MAC operations according to convolution equation */ 00758 for (j = 0u; j <= i; j++) 00759 { 00760 /* Check the array limitations */ 00761 if((((i - j) < srcBLen) && (j < srcALen))) 00762 { 00763 /* z[i] += x[i-j] * y[j] */ 00764 sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]); 00765 } 00766 } 00767 /* Store the output in the destination buffer */ 00768 if(inv == 1) 00769 *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u); 00770 else 00771 *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u); 00772 } 00773 00774 #endif /* #ifndef ARM_MATH_CM0 */ 00775 00776 } 00777