收藏 分享(赏)

英特尔多核平台编码优化大赛的优化过程--补充.docx

上传人:myk79025 文档编号:8136138 上传时间:2019-06-10 格式:DOCX 页数:26 大小:34.38KB
下载 相关 举报
英特尔多核平台编码优化大赛的优化过程--补充.docx_第1页
第1页 / 共26页
英特尔多核平台编码优化大赛的优化过程--补充.docx_第2页
第2页 / 共26页
英特尔多核平台编码优化大赛的优化过程--补充.docx_第3页
第3页 / 共26页
英特尔多核平台编码优化大赛的优化过程--补充.docx_第4页
第4页 / 共26页
英特尔多核平台编码优化大赛的优化过程--补充.docx_第5页
第5页 / 共26页
点击查看更多>>
资源描述

1、英特尔多核平台编码优化大赛的优化过程-补充分类: 代码优化 2007-01-20 17:20 2521 人阅读 评论(2) 收藏 举报英特尔多核平台编码优化大赛的优化过程-补充HouSisongGM 2007.01.20tag: 多核编程,sse2,牛顿迭代 ,代码优化,优化大赛,invsqrt, 开方主要文章请参看我的英特尔多核平台编码优化大赛的优化过程:http:/ 本文章是其补充;提供一个完整的 float 实现版本、double 到 float 的手工转换、手工得到 invSqrt 的粗略起始迭代值 等其它几个不算成功的实现;我测试和优化过程中用的 CPU:AMD64x2 3600+

2、(双核 CPU)操作系统:Windows XP 32bit编译器:Visual Studio 2005大赛公布的原始代码执行时间 3.97 秒一个 float 完整实现版本 (牺牲了计算精度),源代码如下:(如果用汇编实现应该还可以把速度提高一些,或者使用 ICC 编译器:)/* compute the potential energy of a collection of */* particles interacting via pairwise potential */#include #include #include #include #include /使用 SSE1#includ

3、e #include /作者:侯思松 HouS float 单精度浮点 版本#define _IS_FAST /以牺牲精度的方式加快速度,否则就运算达到 float 单精度/#define _NEW_TIME_CLOCK#ifdef _NEW_TIME_CLOCK#define clock_t doubledouble CLOCKS_PER_SEC=0.0;inline double clock() _int64 result;if (CLOCKS_PER_SEC=0)QueryPerformanceFrequency(LARGE_INTEGER *)CLOCKS_PER_SEC=(doubl

4、e)result;QueryPerformanceCounter(LARGE_INTEGER *)return (double)result;#else#include #endif#define _IS_USES_MY_RAND/单线程执行 rand 函数,所以使用自定义 rand 是安全的const long DefthreadCount=2; /1,2,4,8,16, 把计算任务分成多个任务并行执行float #define NPARTS 1000#define NITER 201#define DIMS 3#ifdef _IS_USES_MY_RANDclass CMyRandpriv

5、ate:unsigned long _my_holdrand;public:CMyRand():_my_holdrand(1)inline int _my_rand (void)unsigned long result=_my_holdrand * 214013 + 2531011;_my_holdrand =result;return ( (result16) ;CMyRand _MyRand;inline int _my_rand (void) return _MyRand._my_rand(); #else#define _my_rand rand#endifint rand( void

6、 );int computePot();void initPositions(void);void updatePositions(void);_declspec(align(16) float rDIMS(NPARTS+3)/4*4; /16byte 对齐double pot;int main() int i;clock_t start, stop;/char ctmp; std:cinctmp;initPositions();updatePositions();start=clock();for( i=0; iiBegin; iiEnd; i+ ) computePotPart_forj(

7、i,_m128 dt0; dt0=_mm_movehl_ps(lcpot,lcpot);lcpot=_mm_add_ps(lcpot,dt0); dt0=_mm_shuffle_ps(lcpot,lcpot,1);lcpot=_mm_add_ss(lcpot,dt0); work_data-fResult=m128_value(lcpot,0);/工作线程池 TWorkThreadPool/用于把一个任务拆分成多个线程任务/要求每个小任务任务量相近/todo:改成任务领取模式class TWorkThreadPool;typedef void (*TThreadCallBack)(void *

8、 pData);enum TThreadState thrStartup=0, thrReady, thrBusy, thrTerminate, thrDeath ;class TWorkThreadpublic:volatile HANDLE thread_handle;volatile enum TThreadState state;volatile TThreadCallBack func;volatile void * pdata; /work data volatile HANDLE waitfor_event;TWorkThreadPool* pool;volatile DWORD

9、 thread_ThreadAffinityMask;TWorkThread() memset(this,0,sizeof(TWorkThread); ;void do_work_end(TWorkThread* thread_data);void _cdecl thread_dowork(TWorkThread* thread_data) /void _stdcall thread_dowork(TWorkThread* thread_data)volatile TThreadStateSetThreadAffinityMask(GetCurrentThread(),thread_data-

10、thread_ThreadAffinityMask);state = thrStartup;while(true)WaitForSingleObject(thread_data-waitfor_event, -1);if(state = thrTerminate)break;state = thrBusy;volatile TThreadCallBackif (func!=0)func(void *)thread_data-pdata);do_work_end(thread_data);state = thrDeath;_endthread();/ExitThread(0);class TWo

11、rkThreadPoolprivate:volatile HANDLE thread_event;volatile HANDLE new_thread_event;std:vector work_threads;inline int passel_count() const return (int)work_threads.size()+1; void inti_threads(long work_count) SYSTEM_INFO SystemInfo;GetSystemInfo(long cpu_count =SystemInfo.dwNumberOfProcessors;long be

12、st_count =cpu_count;if (cpu_countwork_count) best_count=work_count;long newthrcount=best_count - 1;work_threads.resize(newthrcount);thread_event = CreateSemaphore(0, 0,newthrcount , 0);new_thread_event = CreateSemaphore(0, 0,newthrcount , 0);for(long i = 0; i 0)ReleaseSemaphore(thread_event,thr_coun

13、t, 0);for(i = 0; i =1);/assert(work_count0)ReleaseSemaphore(thread_event,thr_count, 0);/current thread do a workwork_proc(word_data_listwork_count-1);/wait for work finish for(i = 0; i =1); inti_threads(work_count); TWorkThreadPool() free_threads(); long best_work_count() const return passel_count()

14、; void work_execute(TThreadCallBack work_proc,void* word_data_list,int work_count) while (work_count0)long passel_work_count;if (work_count=passel_count()passel_work_count=passel_count();elsepassel_work_count=work_count;passel_work(work_proc,word_data_list,passel_work_count);work_count-=passel_work_

15、count;word_data_list=inline void DoWorkEnd(TWorkThread* thread_data) thread_data-waitfor_event=new_thread_event; thread_data-func=0;thread_data-state = thrReady;void do_work_end(TWorkThread* thread_data)thread_data-pool-DoWorkEnd(thread_data);/TWorkThreadPool end;/static TWorkThreadPool g_work_threa

16、d_pool(DefthreadCount);/线程池int computePot() static bool is_inti_work=false;static TWorkData work_listDefthreadCount;static TWorkData* pwork_listDefthreadCount;int i;if (!is_inti_work)long fi=0;for (int i=0;iDefthreadCount;+i)if (0=i)work_listi.iBegin=0;elsework_listi.iBegin=work_listi-1.iEnd;if (i=D

17、efthreadCount-1)work_listi.iEnd=(long)NPARTS;elsework_listi.iEnd=(long)( (double)(NPARTS-1)*sqrt(double)(i+1)/DefthreadCount)+1+0.5 );pwork_listi=is_inti_work=true;g_work_thread_pool.work_execute(TThreadCallBack)computePotPart,(void *)(for (i=0;iDefthreadCount;+i)pot+=work_listi.fResult;return 0;代码执

18、行时间 0.125 秒 相对于原始代码加速比:3176.0%注意到一个事实,float 比 double 版快出了很多,why? 原来,double 版中为了使用 SSE 而在 float 和 double 之间的转换花费了很多的时间!我不知道这个问题是 AMD64x2 CPU 的问题还是在酷睿 2 上也一样;double 版中为了优化这个转换,我预先保存一份 r 数组的 float 转化值计算,这样就能节约 double 到 float 转换;(double 版完整源代码参见英特尔多核平台编码优化大赛的优化过程)定义一个临时数组 rf:_declspec(align(16) float rf

19、DIMS(NPARTS+3)/4*4; 修改 updatePositions 函数:void updatePositions() int i,j;for (i=0;iDIMS;+i)for( j=0; jNPARTS; +j )rij -=( 0.5 + _my_rand() *(1.0/RAND_MAX) );rfij=(float)rij;然后重新实现 computePotPart_forj:void computePotPart_forj_double_float(int i,_m128d* pResult)_m128d lcpot=_mm_setzero_pd();_m128d _mm

20、i0=_mm_set1_pd(-r0i);_m128d _mmi1=_mm_set1_pd(-r1i);_m128d _mmi2=_mm_set1_pd(-r2i);_m128 _mmi0f=_mm_set1_ps(-rf0i);_m128 _mmi1f=_mm_set1_ps(-rf1i);_m128 _mmi2f=_mm_set1_ps(-rf2i);int j=0;/*for(;j+4i;j+=4)_m128d dm0=_mm_add_pd(*(_m128d*)dm0=_mm_mul_pd(dm0,dm0);_m128d dm1=_mm_add_pd(*(_m128d*)dm1=_mm_

21、mul_pd(dm1,dm1);_m128d dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2,dm2);dm0=_mm_add_pd(dm0,dm1);dm0=_mm_add_pd(dm0,dm2);_m128d dm5=_mm_add_pd(*(_m128d*)dm5=_mm_mul_pd(dm5,dm5);_m128d dm6=_mm_add_pd(*(_m128d*)dm6=_mm_mul_pd(dm6,dm6);dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2,dm2);dm5=_mm_add_pd(dm5,

22、dm6);dm5=_mm_add_pd(dm5,dm2);/用 SSE 的 rsqrt 近似计算 1/sqrt(a); 然后使用牛顿迭代来提高开方精度/ 1/sqrt(a)的牛顿迭代公式 x_next=(3-a*x*x)*x*0.5 =( 1.5 +(a*(-0.5) * x*x) ) * x /* _m128 sm0=_mm_cvtpd_ps(dm0);_m128 sm1=_mm_cvtpd_ps(dm5);sm0=_mm_movelh_ps(sm0,sm1);*/_m128 sm0=_mm_add_ps(*(_m128*)sm0=_mm_mul_ps(sm0,sm0);_m128 sm1=

23、_mm_add_ps(*(_m128*)sm1=_mm_mul_ps(sm1,sm1);_m128 sm2=_mm_add_ps(*(_m128*)sm2=_mm_mul_ps(sm2,sm2);sm0=_mm_add_ps(sm0,sm1);sm0=_mm_add_ps(sm0,sm2);_m128 sma=_mm_mul_ps(sm0,xmms_0_5); /a*(-0.5)sm0=_mm_rsqrt_ps(sm0); /计算 1/sqrt(a)/牛顿迭代,提高开方精度sma=_mm_mul_ps(sma,sm0);sma=_mm_mul_ps(sma,sm0); sma=_mm_add_

24、ps(sma,xmms1_5); sm0=_mm_mul_ps(sm0,sma); _m128d dma=_mm_mul_pd(dm0,xmmd_0_5); /a*(-0.5)_m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);sm1=_mm_movehl_ps(sm1,sm0);dm0=_mm_cvtps_pd(sm0); / dm5=_mm_cvtps_pd(sm1); / /再次迭代,加倍精度 dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(dmb,dm5);dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(d

25、mb,dm5);dma=_mm_add_pd(dma,xmmd1_5);dmb=_mm_add_pd(dmb,xmmd1_5);dm0=_mm_mul_pd(dm0,dma);dm5=_mm_mul_pd(dm5,dmb);lcpot=_mm_add_pd(lcpot,dm0);lcpot=_mm_add_pd(lcpot,dm5);for (;j+1i;+j)_m128d dm0=_mm_set_sd(r0j);dm0=_mm_add_pd(dm0,_mmi0);dm0=_mm_mul_pd(dm0,dm0);_m128d dm1=_mm_set_sd(r1j);dm1=_mm_add_sd

26、(dm1,_mmi1);dm1=_mm_mul_sd(dm1,dm1);_m128d dm2=_mm_set_sd(r2j);dm2=_mm_add_sd(dm2,_mmi2);dm2=_mm_mul_sd(dm2,dm2);dm0=_mm_add_sd(dm0,dm1);dm0=_mm_add_sd(dm0,dm2); _m128 sm0=_mm_cvtpd_ps(dm0);_m128d dma=_mm_mul_sd(dm0,xmmd_0_5); /a*(-0.5)sm0=_mm_rsqrt_ss(sm0); /计算 1/sqrt(a)dm0=_mm_cvtps_pd(sm0); / /牛顿

27、迭代,提高开方精度dm1=_mm_mul_sd(dm0,dm0);dm1=_mm_mul_sd(dm1,dma);dm1=_mm_add_sd(dm1,xmmd1_5);dm0=_mm_mul_sd(dm0,dm1);/再次迭代dma=_mm_mul_sd(dma,dm0);dma=_mm_mul_sd(dma,dm0);dma=_mm_add_sd(dma,xmmd1_5);dm0=_mm_mul_sd(dm0,dma);lcpot=_mm_add_sd(lcpot,dm0);*pResult=_mm_add_pd(*pResult,lcpot);该代码对速度有很小的改进,但精度却只到小数点

28、后 5/6 位(过早的降低了运算精度,后面的计算使误差放大),再增加一次牛顿迭代就能弥补精度的缺失,但速度上就没有优势了;只能放弃该方案;既然硬件的 float 和 double 之间的转换慢,那我手工来写一个会不会更好一些呢? (比较奇怪的尝试:)见代码:(该函数利用了 double/float 的 IEEE 浮点编码结构)const _m64 _mmd2f_esub =_mm_set_pi32(1023-127) (52-32),0);const _m128i _xmmd2f_esub=_mm_set_epi64(_mmd2f_esub,_mmd2f_esub); void computeP

29、otPart_forj_d2f(int i,_m128d* pResult)_m128d lcpot=_mm_setzero_pd();_m128d _mmi0=_mm_set1_pd(-r0i);_m128d _mmi1=_mm_set1_pd(-r1i);_m128d _mmi2=_mm_set1_pd(-r2i);int j=0;/*for(;j+4i;j+=4)_m128d dm0=_mm_add_pd(*(_m128d*)dm0=_mm_mul_pd(dm0,dm0);_m128d dm1=_mm_add_pd(*(_m128d*)dm1=_mm_mul_pd(dm1,dm1);_m

30、128d dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2,dm2);dm0=_mm_add_pd(dm0,dm1);dm0=_mm_add_pd(dm0,dm2);_m128d dm5=_mm_add_pd(*(_m128d*)dm5=_mm_mul_pd(dm5,dm5);_m128d dm6=_mm_add_pd(*(_m128d*)dm6=_mm_mul_pd(dm6,dm6);dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2,dm2);dm5=_mm_add_pd(dm5,dm6);dm5=_mm_add_p

31、d(dm5,dm2);/用 SSE 的 rsqrt 近似计算 1/sqrt(a); 然后使用牛顿迭代来提高开方精度/ 1/sqrt(a)的牛顿迭代公式 x_next=(3-a*x*x)*x*0.5 =( 1.5 +(a*(-0.5) * x*x) ) * x _m128d dma=_mm_mul_pd(dm0,xmmd_0_5); /a*(-0.5)_m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);(*(_m128i*)(*(_m128i*)(*(_m128i*)(*(_m128i*)(*(_m128i*)_m128 sm0;(*(_m128i*)_m128 sma=_m

32、m_mul_ps(sm0,xmms_0_5); /a*(-0.5)sm0=_mm_rsqrt_ps(sm0); /计算 1/sqrt(a)/牛顿迭代,提高开方精度sma=_mm_mul_ps(sma,sm0);sma=_mm_mul_ps(sma,sm0); sma=_mm_add_ps(sma,xmms1_5); sm0=_mm_mul_ps(sm0,sma); (*(_m128i*) /(*(_m128i*) / (*(_m128i*) /(*(_m128i*)(*(_m128i*)/再次迭代,加倍精度 dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(dmb,

33、dm5);dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(dmb,dm5);dma=_mm_add_pd(dma,xmmd1_5);dmb=_mm_add_pd(dmb,xmmd1_5);dm0=_mm_mul_pd(dm0,dma);dm5=_mm_mul_pd(dm5,dmb);lcpot=_mm_add_pd(lcpot,dm0);lcpot=_mm_add_pd(lcpot,dm5);for (;j+1i;+j)_m128d dm0=_mm_set_sd(r0j);dm0=_mm_add_pd(dm0,_mmi0);dm0=_mm_mul_pd(dm0,d

34、m0);_m128d dm1=_mm_set_sd(r1j);dm1=_mm_add_sd(dm1,_mmi1);dm1=_mm_mul_sd(dm1,dm1);_m128d dm2=_mm_set_sd(r2j);dm2=_mm_add_sd(dm2,_mmi2);dm2=_mm_mul_sd(dm2,dm2);dm0=_mm_add_sd(dm0,dm1);dm0=_mm_add_sd(dm0,dm2); _m128 sm0=_mm_cvtpd_ps(dm0);_m128d dma=_mm_mul_sd(dm0,xmmd_0_5); /a*(-0.5)sm0=_mm_rsqrt_ss(sm

35、0); /计算 1/sqrt(a)dm0=_mm_cvtps_pd(sm0); / /牛顿迭代,提高开方精度dm1=_mm_mul_sd(dm0,dm0);dm1=_mm_mul_sd(dm1,dma);dm1=_mm_add_sd(dm1,xmmd1_5);dm0=_mm_mul_sd(dm0,dm1);/再次迭代dma=_mm_mul_sd(dma,dm0);dma=_mm_mul_sd(dma,dm0);dma=_mm_add_sd(dma,xmmd1_5);dm0=_mm_mul_sd(dm0,dma);lcpot=_mm_add_sd(lcpot,dm0);*pResult=_mm_

36、add_pd(*pResult,lcpot);该函数的速度比原来的代码稍慢:) 放弃之(我还尝试过这样的代码(代码没有保存:( )代码原理和上面的很接近,利用 IEEE 的浮点格式,强制把 double 转成 float 后( 通过指数平衡和移位),分两路使用_mm_rsqrt_ps、牛顿迭代等;但这个更慢,也放弃了)既然硬件的 float 和 double 之间的转换慢,那我就不转换来看看怎样实现;不使用 SSE 的_mm_rsqrt_ps 指令,而利用 IEEE 浮点格式生成一个粗略的近似解,然后迭代(迭代了 3 次);(也可以用查表的方式来得到初始解,但在 SSE 体系中,这种实现很可能得不

37、偿失,所以没有去实现); (该函数利用了 double 的 IEEE 浮点编码结构)这样以后,速度有了一些提高,但精度还有点不够(大概有 6 位小数位精度),再次迭代的话就失去了速度优势;不知道有没有比魔法数 0x5fe6ec85,0xe7de30da 更好的魔法数:)放弃;const _m64 _mmi_mn=_mm_set_pi32(0x5fe6ec85,0xe7de30da);const _m128i xmmi64_mn=_mm_set1_epi64(_mmi_mn);void computePotPart_forj_int(int i,_m128d* pResult)_m128d lcp

38、ot=_mm_setzero_pd();_m128d _mmi0=_mm_set1_pd(-r0i);_m128d _mmi1=_mm_set1_pd(-r1i);_m128d _mmi2=_mm_set1_pd(-r2i);int j=0;/*for(;j+4i;j+=4)_m128d dm0=_mm_add_pd(*(_m128d*)dm0=_mm_mul_pd(dm0,dm0);_m128d dm1=_mm_add_pd(*(_m128d*)dm1=_mm_mul_pd(dm1,dm1);_m128d dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2

39、,dm2);dm0=_mm_add_pd(dm0,dm1);dm0=_mm_add_pd(dm0,dm2);_m128d dm5=_mm_add_pd(*(_m128d*)dm5=_mm_mul_pd(dm5,dm5);_m128d dm6=_mm_add_pd(*(_m128d*)dm6=_mm_mul_pd(dm6,dm6);dm2=_mm_add_pd(*(_m128d*)dm2=_mm_mul_pd(dm2,dm2);dm5=_mm_add_pd(dm5,dm6);dm5=_mm_add_pd(dm5,dm2);/利用 IEEE double 浮点格式的编码生成 1/sqrt(a)的一个

40、近似值; 然后使用牛顿迭代来提高精度/ 1/sqrt(a)的牛顿迭代公式 x_next=(3-a*x*x)*x*0.5 =( 1.5 +(a*(-0.5) * x*x) ) * x _m128i xmmi0=xmmi64_mn;_m128d dma=_mm_mul_pd(dm0,xmmd_0_5); /a*(-0.5)_m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);*(_m128i*)*(_m128i*)*(_m128i*)*(_m128i*)/迭代,加倍精度dm1=_mm_mul_pd(dma,dm0);dm2=_mm_mul_pd(dmb,dm5);dm1=_mm_

41、mul_pd(dm1,dm0);dm2=_mm_mul_pd(dm2,dm5);dm1=_mm_add_pd(dm1,xmmd1_5);dm2=_mm_add_pd(dm2,xmmd1_5);dm0=_mm_mul_pd(dm0,dm1);dm5=_mm_mul_pd(dm5,dm2);dm1=_mm_mul_pd(dma,dm0);dm2=_mm_mul_pd(dmb,dm5);dm1=_mm_mul_pd(dm1,dm0);dm2=_mm_mul_pd(dm2,dm5);dm1=_mm_add_pd(dm1,xmmd1_5);dm2=_mm_add_pd(dm2,xmmd1_5);dm0=

42、_mm_mul_pd(dm0,dm1);dm5=_mm_mul_pd(dm5,dm2);dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(dmb,dm5);dma=_mm_mul_pd(dma,dm0);dmb=_mm_mul_pd(dmb,dm5);dma=_mm_add_pd(dma,xmmd1_5);dmb=_mm_add_pd(dmb,xmmd1_5);dm0=_mm_mul_pd(dm0,dma);dm5=_mm_mul_pd(dm5,dmb);/精度还有点不够 但再次迭代的话就失去了速度优势lcpot=_mm_add_pd(lcpot,dm0);lcpo

43、t=_mm_add_pd(lcpot,dm5);for (;j+1i;+j)_m128d dm0=_mm_set_sd(r0j);dm0=_mm_add_pd(dm0,_mmi0);dm0=_mm_mul_pd(dm0,dm0);_m128d dm1=_mm_set_sd(r1j);dm1=_mm_add_sd(dm1,_mmi1);dm1=_mm_mul_sd(dm1,dm1);_m128d dm2=_mm_set_sd(r2j);dm2=_mm_add_sd(dm2,_mmi2);dm2=_mm_mul_sd(dm2,dm2);dm0=_mm_add_sd(dm0,dm1);dm0=_mm

44、_add_sd(dm0,dm2); _m128 sm0=_mm_cvtpd_ps(dm0);_m128d dma=_mm_mul_sd(dm0,xmmd_0_5); /a*(-0.5)sm0=_mm_rsqrt_ss(sm0); /计算 1/sqrt(a)dm0=_mm_cvtps_pd(sm0); / /牛顿迭代,提高开方精度dm1=_mm_mul_sd(dm0,dm0);dm1=_mm_mul_sd(dm1,dma);dm1=_mm_add_sd(dm1,xmmd1_5);dm0=_mm_mul_sd(dm0,dm1);/再次迭代dma=_mm_mul_sd(dma,dm0);dma=_mm_mul_sd(dma,dm0);dma=_mm_add_sd(dma,xmmd1_5);dm0=_mm_mul_sd(dm0,dma);lcpot=_mm_add_sd(lcpot,dm0);*pResult=_mm_add_pd(*pResult,lcpot);还想到一个没有时间去实现的方案:由于 SSE 和 x87 是独立的两个硬件,那么可以使它们并行执行;SSE 部件代码不变,但循环做更大的展开,然后让 x87 承担一路或两路运算;

展开阅读全文
相关资源
猜你喜欢
相关搜索

当前位置:首页 > 企业管理 > 管理学资料

本站链接:文库   一言   我酷   合作


客服QQ:2549714901微博号:道客多多官方知乎号:道客多多

经营许可证编号: 粤ICP备2021046453号世界地图

道客多多©版权所有2020-2025营业执照举报