Diffstat (limited to 'REORG.TODO/sysdeps/i386')
-rw-r--r--  REORG.TODO/sysdeps/i386/Implies | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/Makefile | 103
-rw-r--r--  REORG.TODO/sysdeps/i386/Versions | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/____longjmp_chk.S | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/__longjmp.S | 72
-rw-r--r--  REORG.TODO/sysdeps/i386/abort-instr.h | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/add_n.S | 111
-rw-r--r--  REORG.TODO/sysdeps/i386/addmul_1.S | 86
-rw-r--r--  REORG.TODO/sysdeps/i386/asm-syntax.h | 24
-rw-r--r--  REORG.TODO/sysdeps/i386/atomic-machine.h | 545
-rw-r--r--  REORG.TODO/sysdeps/i386/backtrace.c | 163
-rw-r--r--  REORG.TODO/sysdeps/i386/bcopy.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/bsd-_setjmp.S | 56
-rw-r--r--  REORG.TODO/sysdeps/i386/bsd-setjmp.S | 66
-rw-r--r--  REORG.TODO/sysdeps/i386/bzero.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/cacheinfo.c | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/configure | 84
-rw-r--r--  REORG.TODO/sysdeps/i386/configure.ac | 52
-rw-r--r--  REORG.TODO/sysdeps/i386/crti.S | 84
-rw-r--r--  REORG.TODO/sysdeps/i386/crtn.S | 47
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-irel.h | 51
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-lookupcfg.h | 32
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-machine.h | 757
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-procinfo.c | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-tls.h | 61
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-tlsdesc.S | 285
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-tlsdesc.h | 61
-rw-r--r--  REORG.TODO/sysdeps/i386/dl-trampoline.S | 215
-rw-r--r--  REORG.TODO/sysdeps/i386/ffs.c | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/Implies | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/Versions | 6
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/doasin.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acos.S | 25
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acosf.S | 24
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acosh.S | 101
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acoshf.S | 101
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acoshl.S | 107
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_acosl.c | 29
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_asin.S | 38
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_asinf.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atan2.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atan2f.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atan2l.c | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atanh.S | 112
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atanhf.S | 109
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_atanhl.S | 127
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp.S | 73
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp10.S | 53
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp10f.S | 53
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp10l.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp2.S | 52
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp2f.S | 52
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_exp2l.S | 60
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_expf.S | 74
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_expl.S | 226
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_fmod.S | 18
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_fmodf.S | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_fmodl.c | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_hypot.S | 75
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_hypotf.S | 64
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_ilogb.S | 42
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S | 42
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log.S | 92
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log10.S | 68
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log10f.S | 69
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log10l.S | 71
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log2.S | 69
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log2f.S | 69
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_log2l.S | 70
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_logf.S | 93
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_logl.S | 97
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_pow.S | 456
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_powf.S | 392
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_powl.S | 459
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_remainder.S | 18
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_remainderf.S | 18
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_remainderl.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_scalb.S | 100
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_scalbf.S | 102
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_scalbl.S | 90
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_sqrt.S | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S | 13
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c | 69
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c | 54
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c | 54
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fegetenv.c | 49
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fegetexcept.c | 31
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fegetmode.c | 32
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fegetround.c | 33
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fenv_private.h | 501
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fesetenv.c | 131
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fesetexcept.c | 31
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fesetmode.c | 54
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fesetround.c | 54
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/feupdateenv.c | 60
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c | 57
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c | 124
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c | 69
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/ftestexcept.c | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/halfulp.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h | 340
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/libm-test-ulps | 2202
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/math-tests.h | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/math_private.h | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/mpatan.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/mpatan2.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/mpexp.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/mplog.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/mpsqrt.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_asinh.S | 139
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_asinhf.S | 139
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_asinhl.S | 144
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_atan.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_atanf.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_atanl.c | 22
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_cbrt.S | 200
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S | 177
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S | 229
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_ceil.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_ceilf.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_ceill.S | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_copysign.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_copysignf.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_copysignl.S | 21
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_expm1.S | 113
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_expm1f.S | 113
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_expm1l.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fabs.S | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fabsf.S | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fabsl.S | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fdim.c | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_finite.S | 17
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_finitef.S | 16
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_finitel.S | 15
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_floor.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_floorf.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_floorl.S | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fmax.S | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S | 71
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fmin.S | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fminf.S | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fminl.S | 71
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c | 42
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_frexp.S | 83
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_frexpf.S | 80
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_frexpl.S | 92
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_isinfl.c | 32
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_isnanl.c | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_llrint.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_llrintf.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_llrintl.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_log1p.S | 67
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_log1pf.S | 67
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_log1pl.S | 76
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_logb.S | 16
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_logbf.S | 16
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_logbl.c | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_lrint.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_lrintf.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_lrintl.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c | 125
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c | 93
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c | 77
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_remquo.S | 45
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_remquof.S | 45
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_remquol.S | 45
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_rint.S | 15
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_rintf.S | 15
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_rintl.c | 18
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalbln.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalbn.S | 24
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S | 24
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_significand.S | 16
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_significandf.S | 16
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_significandl.c | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_trunc.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_truncf.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/s_truncl.S | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/slowexp.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/slowpow.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/t_exp.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/gccframe.h | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/gmp-mparam.h | 28
-rw-r--r--  REORG.TODO/sysdeps/i386/htonl.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/htons.S | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/i386-mcount.S | 79
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/add_n.S | 143
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/addmul_1.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/bzero.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/init-arch.h | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/lshift.S | 255
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memcopy.h | 95
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memcpy.S | 124
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/mempcpy.S | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memset.S | 121
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/memusage.h | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/mul_1.S | 90
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/rshift.S | 255
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/stpcpy.S | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strchr.S | 348
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strcpy.S | 169
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/strlen.S | 182
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/sub_n.S | 143
-rw-r--r--  REORG.TODO/sysdeps/i386/i586/submul_1.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/Makefile | 12
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/add_n.S | 110
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/bcopy.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/bzero.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/dl-hash.h | 79
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/ffs.c | 48
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_log.S | 29
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S | 22
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S | 325
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps | 2188
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S | 553
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c | 29
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S | 586
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c | 30
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S | 566
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c | 28
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S | 58
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S | 58
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/hp-timing.h | 42
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/init-arch.h | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memcmp.S | 408
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memcpy.S | 98
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memmove.S | 120
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/mempcpy.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memset.S | 100
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/memusage.h | 21
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/Makefile | 44
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S | 59
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S | 62
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c | 376
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym | 11
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 502
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S | 709
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S | 1225
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 2157
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S | 62
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S | 681
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 1809
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 3162
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S | 78
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S | 89
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S | 94
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S | 81
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S | 50
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 417
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S | 724
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S | 45
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 811
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S | 860
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset.S | 75
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S | 82
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S | 65
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c | 27
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c | 12
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c | 13
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S | 1245
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S | 572
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S | 92
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S | 158
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S | 348
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S | 57
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S | 804
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 2810
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S | 95
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2250
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 3901
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S | 116
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S | 75
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | 125
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S | 695
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S | 60
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c | 13
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c | 8
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c | 10
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S | 282
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S | 708
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S | 57
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S | 56
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c | 22
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S | 219
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c | 14
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S | 1018
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S | 39
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 600
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S | 36
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S | 193
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 354
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c | 9
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/nptl/tls.h | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/stack-aliasing.h | 23
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/strcmp.S | 52
-rw-r--r--  REORG.TODO/sysdeps/i386/i686/tst-stack-align.h | 44
-rw-r--r--  REORG.TODO/sysdeps/i386/i786/Implies | 2
-rw-r--r--  REORG.TODO/sysdeps/i386/init-arch.h | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/jmpbuf-offsets.h | 25
-rw-r--r--  REORG.TODO/sysdeps/i386/jmpbuf-unwind.h | 47
-rw-r--r--  REORG.TODO/sysdeps/i386/ldbl2mpn.c | 120
-rw-r--r--  REORG.TODO/sysdeps/i386/ldsodefs.h | 41
-rw-r--r--  REORG.TODO/sysdeps/i386/link-defines.sym | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/lshift.S | 103
-rw-r--r--  REORG.TODO/sysdeps/i386/machine-gmon.h | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/memchr.S | 322
-rw-r--r--  REORG.TODO/sysdeps/i386/memcmp.S | 73
-rw-r--r--  REORG.TODO/sysdeps/i386/memcopy.h | 92
-rw-r--r--  REORG.TODO/sysdeps/i386/memcpy.S | 95
-rw-r--r--  REORG.TODO/sysdeps/i386/memcpy_chk.S | 34
-rw-r--r--  REORG.TODO/sysdeps/i386/memmove.S | 4
-rw-r--r--  REORG.TODO/sysdeps/i386/memmove_chk.S | 33
-rw-r--r--  REORG.TODO/sysdeps/i386/mempcpy.S | 7
-rw-r--r--  REORG.TODO/sysdeps/i386/mempcpy_chk.S | 33
-rw-r--r--  REORG.TODO/sysdeps/i386/memset.S | 68
-rw-r--r--  REORG.TODO/sysdeps/i386/memset_chk.S | 33
-rw-r--r--  REORG.TODO/sysdeps/i386/memusage.h | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/mp_clz_tab.c | 1
-rw-r--r--  REORG.TODO/sysdeps/i386/mul_1.S | 86
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/Makefile | 26
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c | 19
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S | 31
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/pthreaddef.h | 40
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym | 17
-rw-r--r--  REORG.TODO/sysdeps/i386/nptl/tls.h | 435
-rw-r--r--  REORG.TODO/sysdeps/i386/preconfigure | 5
-rw-r--r--  REORG.TODO/sysdeps/i386/pthread_spin_trylock.S | 46
-rw-r--r--  REORG.TODO/sysdeps/i386/rawmemchr.S | 222
-rw-r--r--  REORG.TODO/sysdeps/i386/rshift.S | 105
-rw-r--r--  REORG.TODO/sysdeps/i386/setfpucw.c | 54
-rw-r--r--  REORG.TODO/sysdeps/i386/setjmp.S | 58
-rw-r--r--  REORG.TODO/sysdeps/i386/stackguard-macros.h | 12
-rw-r--r--  REORG.TODO/sysdeps/i386/stackinfo.h | 43
-rw-r--r--  REORG.TODO/sysdeps/i386/start.S | 139
-rw-r--r--  REORG.TODO/sysdeps/i386/stpcpy.S | 88
-rw-r--r--  REORG.TODO/sysdeps/i386/stpncpy.S | 147
-rw-r--r--  REORG.TODO/sysdeps/i386/strcat.S | 265
-rw-r--r--  REORG.TODO/sysdeps/i386/strchr.S | 290
-rw-r--r--  REORG.TODO/sysdeps/i386/strchrnul.S | 278
-rw-r--r--  REORG.TODO/sysdeps/i386/strcspn.S | 240
-rw-r--r--  REORG.TODO/sysdeps/i386/string-inlines.c | 47
-rw-r--r--  REORG.TODO/sysdeps/i386/strlen.S | 132
-rw-r--r--  REORG.TODO/sysdeps/i386/strlen.c | 35
-rw-r--r--  REORG.TODO/sysdeps/i386/strpbrk.S | 243
-rw-r--r--  REORG.TODO/sysdeps/i386/strrchr.S | 334
-rw-r--r--  REORG.TODO/sysdeps/i386/strspn.S | 240
-rw-r--r--  REORG.TODO/sysdeps/i386/sub_n.S | 111
-rw-r--r--  REORG.TODO/sysdeps/i386/submul_1.S | 86
-rw-r--r--  REORG.TODO/sysdeps/i386/symbol-hacks.h | 21
-rw-r--r--  REORG.TODO/sysdeps/i386/sys/ucontext.h | 139
-rw-r--r--  REORG.TODO/sysdeps/i386/sysdep.h | 159
-rw-r--r--  REORG.TODO/sysdeps/i386/tls-macros.h | 78
-rw-r--r--  REORG.TODO/sysdeps/i386/tlsdesc.c | 268
-rw-r--r--  REORG.TODO/sysdeps/i386/tlsdesc.sym | 17
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-audit.h | 25
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-audit3.c | 37
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-audit3.h | 20
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-auditmod3a.c | 38
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-auditmod3b.c | 186
-rwxr-xr-x  REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh | 103
-rw-r--r--  REORG.TODO/sysdeps/i386/tst-stack-align.h | 41
450 files changed, 62011 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/Implies b/REORG.TODO/sysdeps/i386/Implies
new file mode 100644
index 0000000000..20b2dffc29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Implies
@@ -0,0 +1,5 @@
+x86
+wordsize-32
+ieee754/ldbl-96
+ieee754/dbl-64
+ieee754/flt-32
diff --git a/REORG.TODO/sysdeps/i386/Makefile b/REORG.TODO/sysdeps/i386/Makefile
new file mode 100644
index 0000000000..e30e1339f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Makefile
@@ -0,0 +1,103 @@
+# The mpn functions need a #define for asm syntax flavor.
+# Every i386 port in use uses gas syntax (I think).
+asm-CPPFLAGS += -DGAS_SYNTAX
+
+# The i386 `long double' is a distinct type we support.
+long-double-fcts = yes
+
+ifeq ($(subdir),string)
+sysdep_routines += cacheinfo
+endif
+
+ifeq ($(subdir),gmon)
+sysdep_routines += i386-mcount
+endif
+
+ifeq ($(subdir),elf)
+CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused
+CFLAGS-dl-load.c += -Wno-unused
+CFLAGS-dl-reloc.c += -Wno-unused
+endif
+
+ifeq ($(subdir),debug)
+CFLAGS-backtrace.c += -fexceptions
+endif
+
+# Most of the glibc routines never call user-defined callbacks and don't
+# use the FPU or SSE*, and as such don't need %esp alignment bigger
+# than 4 bytes.
+# Lots of routines in math do use the FPU, so make the math subdir an
+# exception here.
+# In gcc 4.6 (and maybe earlier?) giving -mpreferred-stack-boundary=2 is
+# an error, so don't try to reduce it here like we used to. We still
+# explicitly set -mpreferred-stack-boundary=4 in the places where it matters,
+# in case an older compiler defaulted to 2.
+ifeq ($(subdir),math)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+else
+ifeq ($(subdir),csu)
+sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
+else
+# Likewise, any function which calls user callbacks
+uses-callbacks += -mpreferred-stack-boundary=4
+# Likewise, any stack alignment tests
+stack-align-test-flags += -malign-double -mpreferred-stack-boundary=4
+endif
+endif
+
+# And a couple of other routines
+ifeq ($(subdir),stdlib)
+CFLAGS-exit.c += -mpreferred-stack-boundary=4
+CFLAGS-cxa_finalize.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),elf)
+CFLAGS-dl-init.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-fini.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-open.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-close.c += -mpreferred-stack-boundary=4
+CFLAGS-dl-error.c += -mpreferred-stack-boundary=4
+endif
+ifeq ($(subdir),dlfcn)
+CFLAGS-dlopen.c += -mpreferred-stack-boundary=4
+CFLAGS-dlopenold.c += -mpreferred-stack-boundary=4
+CFLAGS-dlclose.c += -mpreferred-stack-boundary=4
+CFLAGS-dlerror.c += -mpreferred-stack-boundary=4
+endif
+
+ifneq (,$(filter -mno-tls-direct-seg-refs,$(CFLAGS)))
+defines += -DNO_TLS_DIRECT_SEG_REFS
+endif
+
+ifeq ($(subdir),elf)
+sysdep-dl-routines += tlsdesc dl-tlsdesc
+
+tests += tst-audit3
+modules-names += tst-auditmod3a tst-auditmod3b
+
+$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
+$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
+tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
+endif
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += tlsdesc.sym
+endif
+
+# Make sure no code in ld.so uses mm/xmm/ymm/zmm registers on i386 since
+# the first 3 mm/xmm/ymm/zmm registers are used to pass vector parameters
+# which must be preserved.
+# With SSE disabled, ensure -fpmath is not set to use sse either.
+rtld-CFLAGS += -mno-sse -mno-mmx -mfpmath=387
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+ $(rtld-CFLAGS))
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/i386/tst-ld-sse-use.sh $(objpfx)ld.so
+ @echo "Checking ld.so for SSE register use. This will take a few seconds..."
+ $(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+ $(evaluate-test)
+else
+CFLAGS-.os += $(if $(filter rtld-%.os,$(@F)), $(rtld-CFLAGS))
+endif
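
The stack-alignment reasoning in the Makefile above can be made concrete.
Below is a minimal sketch of the kind of check the tree's tst-stack-align.h
headers perform, assuming a build with the flags from stack-align-test-flags
(-malign-double -mpreferred-stack-boundary=4); the function name is
illustrative, not glibc's:

#include <stdint.h>
#include <stdio.h>

/* With a 16-byte stack boundary and -malign-double in effect, a local
   double must land on an 8-byte boundary, even inside a callback.  */
static int
check_align (void)
{
  double d = 0.0;
  uintptr_t a = (uintptr_t) &d;
  printf ("&d = %p, 8-byte aligned: %s\n", (void *) a,
          (a & 7) == 0 ? "yes" : "no");
  return (a & 7) != 0;   /* nonzero signals misalignment */
}

int
main (void)
{
  return check_align ();
}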
diff --git a/REORG.TODO/sysdeps/i386/Versions b/REORG.TODO/sysdeps/i386/Versions
new file mode 100644
index 0000000000..7be44aad7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/Versions
@@ -0,0 +1,35 @@
+ld {
+ GLIBC_2.3 {
+ # The alternative i386 runtime interface to TLS.
+ ___tls_get_addr;
+ }
+}
+libc {
+ GLIBC_2.0 {
+ # Functions from libgcc.
+ __divdi3; __moddi3; __udivdi3; __umoddi3;
+ }
+ GLIBC_2.1 {
+ # global variable
+ _fp_hw;
+ }
+ GLIBC_2.1.1 {
+ # extern inline functions used by <bits/string.h>
+ __memcpy_c; __memset_cc; __memset_cg; __memset_gg;
+ __memcpy_by2; __memcpy_by4; __memcpy_g; __mempcpy_by2; __mempcpy_by4;
+ __mempcpy_byn; __memset_ccn_by2; __memset_ccn_by4; __memset_gcn_by2;
+ __memset_gcn_by4; __stpcpy_g; __strcat_c; __strcat_g; __strchr_c;
+ __strchr_g; __strchrnul_c; __strchrnul_g; __strcmp_gg; __strcpy_g;
+ __strcspn_c1; __strcspn_cg; __strcspn_g; __strlen_g; __strncat_g;
+ __strncmp_g; __strncpy_by2; __strncpy_by4; __strncpy_byn; __strncpy_gg;
+ __strpbrk_cg; __strpbrk_g; __strrchr_c; __strrchr_g; __strspn_c1;
+ __strspn_cg; __strspn_g; __strstr_cg; __strstr_g;
+ }
+}
+libm {
+ GLIBC_2.1 {
+ # A generic bug got this omitted from other configurations' version
+ # sets, but we always had it.
+ exp2l;
+ }
+}
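
The libgcc functions exported at GLIBC_2.0 above are the helpers gcc emits
calls to for 64-bit integer division on i386, which has no native 64-bit
divide instruction. A small illustration (compile with gcc -m32 and inspect
the assembly to see the calls):

/* The quotient compiles to a call to __divdi3 and the remainder to
   __moddi3; the unsigned variants use __udivdi3 and __umoddi3.  */
long long
div64 (long long a, long long b)
{
  return a / b + a % b;
}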
diff --git a/REORG.TODO/sysdeps/i386/____longjmp_chk.S b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
new file mode 100644
index 0000000000..0910861a9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/____longjmp_chk.S
@@ -0,0 +1 @@
+#error "OS-specific version needed"
diff --git a/REORG.TODO/sysdeps/i386/__longjmp.S b/REORG.TODO/sysdeps/i386/__longjmp.S
new file mode 100644
index 0000000000..3719763cd6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/__longjmp.S
@@ -0,0 +1,72 @@
+/* longjmp for i386.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+ .text
+ENTRY (__longjmp)
+#ifdef PTR_DEMANGLE
+ movl 4(%esp), %eax /* User's jmp_buf in %eax. */
+
+ /* Save the return address now. */
+ movl (JB_PC*4)(%eax), %edx
+ /* Get the stack pointer. */
+ movl (JB_SP*4)(%eax), %ecx
+ PTR_DEMANGLE (%edx)
+ PTR_DEMANGLE (%ecx)
+ LIBC_PROBE (longjmp, 3, 4@%eax, -4@8(%esp), 4@%edx)
+ cfi_def_cfa(%eax, 0)
+ cfi_register(%eip, %edx)
+ cfi_register(%esp, %ecx)
+ cfi_offset(%ebx, JB_BX*4)
+ cfi_offset(%esi, JB_SI*4)
+ cfi_offset(%edi, JB_DI*4)
+ cfi_offset(%ebp, JB_BP*4)
+ /* Restore registers. */
+ movl (JB_BX*4)(%eax), %ebx
+ movl (JB_SI*4)(%eax), %esi
+ movl (JB_DI*4)(%eax), %edi
+ movl (JB_BP*4)(%eax), %ebp
+ cfi_restore(%ebx)
+ cfi_restore(%esi)
+ cfi_restore(%edi)
+ cfi_restore(%ebp)
+
+ LIBC_PROBE (longjmp_target, 3, 4@%eax, -4@8(%esp), 4@%edx)
+ movl 8(%esp), %eax /* Second argument is return value. */
+ movl %ecx, %esp
+#else
+ movl 4(%esp), %ecx /* User's jmp_buf in %ecx. */
+ movl 8(%esp), %eax /* Second argument is return value. */
+ /* Save the return address now. */
+ movl (JB_PC*4)(%ecx), %edx
+ LIBC_PROBE (longjmp, 3, 4@%ecx, -4@%eax, 4@%edx)
+ /* Restore registers. */
+ movl (JB_BX*4)(%ecx), %ebx
+ movl (JB_SI*4)(%ecx), %esi
+ movl (JB_DI*4)(%ecx), %edi
+ movl (JB_BP*4)(%ecx), %ebp
+ movl (JB_SP*4)(%ecx), %esp
+ LIBC_PROBE (longjmp_target, 3, 4@%ecx, -4@%ecx, 4@%edx)
+#endif
+ /* Jump to saved PC. */
+ jmp *%edx
+END (__longjmp)
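
The PTR_DEMANGLE path above exists because setjmp stores the saved %eip and
%esp in mangled form, so overwriting a jmp_buf is not by itself enough to
redirect control. A rough C sketch of the XOR-plus-rotate scheme follows;
the guard value and the rotate count of 9 are stand-ins for illustration,
not a normative definition:

#include <stdint.h>

/* Stand-in for the per-process secret glibc keeps in the TCB.  */
static uint32_t pointer_guard = 0x5bd1e995u;

/* Mangle on setjmp: XOR with the guard, then rotate left.  */
static uint32_t
ptr_mangle (uint32_t p)
{
  uint32_t x = p ^ pointer_guard;
  return (x << 9) | (x >> 23);
}

/* Demangle on longjmp: rotate right, then XOR -- the exact inverse,
   which is what __longjmp applies to JB_PC and JB_SP before use.  */
static uint32_t
ptr_demangle (uint32_t p)
{
  uint32_t x = (p >> 9) | (p << 23);
  return x ^ pointer_guard;
}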
diff --git a/REORG.TODO/sysdeps/i386/abort-instr.h b/REORG.TODO/sysdeps/i386/abort-instr.h
new file mode 100644
index 0000000000..810f10379b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/abort-instr.h
@@ -0,0 +1,2 @@
+/* An instruction which should crash any program is `hlt'. */
+#define ABORT_INSTRUCTION asm ("hlt")
diff --git a/REORG.TODO/sysdeps/i386/add_n.S b/REORG.TODO/sysdeps/i386/add_n.S
new file mode 100644
index 0000000000..c2923094a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/add_n.S
@@ -0,0 +1,111 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+ limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_add_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 4)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 0)
+ movl S2(%esp),%edx
+ movl SIZE(%esp),%ecx
+ movl %ecx,%eax
+ shrl $3,%ecx /* compute count for unrolled loop */
+ negl %eax
+ andl $7,%eax /* get index where to start loop */
+ jz L(oop) /* necessary special case for 0 */
+ incl %ecx /* adjust loop count */
+ shll $2,%eax /* adjustment for pointers... */
+ subl %eax,%edi /* ... since they are offset ... */
+ subl %eax,%esi /* ... by a constant when we ... */
+ subl %eax,%edx /* ... enter the loop */
+ shrl $2,%eax /* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC. Due to limitations in some
+   assemblers, L(oop)-L(0)-3 cannot be put into the leal */
+ call L(0)
+ cfi_adjust_cfa_offset (4)
+L(0): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $(L(oop)-L(0)-3),%eax
+ addl $4,%esp
+ cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC. */
+ leal (L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+ jmp *%eax /* jump into loop */
+ ALIGN (3)
+L(oop): movl (%esi),%eax
+ adcl (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ adcl 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ adcl 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ adcl 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ adcl 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ adcl 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ adcl 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ adcl 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_add_n)
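
The loop above is unrolled eight-fold, and the computed jmp *%eax dispatches
into the middle of the unrolled body to handle the leading size % 8 limbs
(the three pointers are pre-biased so every entry point indexes correctly).
Functionally the routine computes the following, shown here as a plain C
sketch with mp_limb_t standing in for the 32-bit limb type:

typedef unsigned int mp_limb_t;   /* 32-bit limb on i386 */

/* res[0..n-1] = s1[0..n-1] + s2[0..n-1]; returns the final carry.  */
mp_limb_t
mpn_add_n_ref (mp_limb_t *res, const mp_limb_t *s1,
               const mp_limb_t *s2, int n)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < n; i++)
    {
      mp_limb_t t = s1[i] + s2[i];
      mp_limb_t c1 = t < s1[i];       /* carry out of the limb add */
      mp_limb_t sum = t + carry;
      mp_limb_t c2 = sum < t;         /* carry out of adding carry-in */
      res[i] = sum;
      carry = c1 | c2;
    }
  return carry;
}

The assembly gets the same carry chain for free from adcl; the sbbl/negl
pair at the end converts the final carry flag into the 0/1 return value.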
diff --git a/REORG.TODO/sysdeps/i386/addmul_1.S b/REORG.TODO/sysdeps/i386/addmul_1.S
new file mode 100644
index 0000000000..ad90ea53e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/addmul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_addmul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %sizeP
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%sizeP,4), %res_ptr
+ leal (%s1_ptr,%sizeP,4), %s1_ptr
+ negl %sizeP
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+L(oop):
+ movl (%s1_ptr,%sizeP,4), %eax
+ mull %s2_limb
+ addl %ebp, %eax
+ adcl $0, %edx
+ addl %eax, (%res_ptr,%sizeP,4)
+ adcl $0, %edx
+ movl %edx, %ebp
+
+ incl %sizeP
+ jnz L(oop)
+ movl %ebp, %eax
+
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+END (__mpn_addmul_1)
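
As with add_n, a C rendering of the loop may help: each iteration forms the
64-bit product s1[i] * s2_limb, adds the running carry and the existing
res[i] limb, stores the low half, and carries the high half onward (the two
adcl $0, %edx instructions absorb the two possible carries). A sketch:

typedef unsigned int mp_limb_t;   /* 32-bit limb on i386 */

/* res[0..n-1] += s1[0..n-1] * limb; returns the high carry limb.  */
mp_limb_t
mpn_addmul_1_ref (mp_limb_t *res, const mp_limb_t *s1,
                  int n, mp_limb_t limb)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned long long prod = (unsigned long long) s1[i] * limb + carry;
      unsigned long long t = (prod & 0xffffffffu) + res[i];
      res[i] = (mp_limb_t) t;
      carry = (mp_limb_t) (prod >> 32) + (mp_limb_t) (t >> 32);
    }
  return carry;
}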
diff --git a/REORG.TODO/sysdeps/i386/asm-syntax.h b/REORG.TODO/sysdeps/i386/asm-syntax.h
new file mode 100644
index 0000000000..a992da2dd1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/asm-syntax.h
@@ -0,0 +1,24 @@
+/* Definitions for x86 syntax variations.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library. Its master source is NOT part of
+ the C library, however. The master source lives in the GNU MP Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#undef ALIGN
+#define ALIGN(log) .align 1<<log
+
+#undef L
+#define L(body) .L##body
diff --git a/REORG.TODO/sysdeps/i386/atomic-machine.h b/REORG.TODO/sysdeps/i386/atomic-machine.h
new file mode 100644
index 0000000000..0e24200617
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/atomic-machine.h
@@ -0,0 +1,545 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+#include <tls.h> /* For tcbhead_t. */
+
+
+typedef int8_t atomic8_t;
+typedef uint8_t uatomic8_t;
+typedef int_fast8_t atomic_fast8_t;
+typedef uint_fast8_t uatomic_fast8_t;
+
+typedef int16_t atomic16_t;
+typedef uint16_t uatomic16_t;
+typedef int_fast16_t atomic_fast16_t;
+typedef uint_fast16_t uatomic_fast16_t;
+
+typedef int32_t atomic32_t;
+typedef uint32_t uatomic32_t;
+typedef int_fast32_t atomic_fast32_t;
+typedef uint_fast32_t uatomic_fast32_t;
+
+typedef int64_t atomic64_t;
+typedef uint64_t uatomic64_t;
+typedef int_fast64_t atomic_fast64_t;
+typedef uint_fast64_t uatomic_fast64_t;
+
+typedef intptr_t atomicptr_t;
+typedef uintptr_t uatomicptr_t;
+typedef intmax_t atomic_max_t;
+typedef uintmax_t uatomic_max_t;
+
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+# define LOCK_PREFIX /* nothing */
+# else
+# define LOCK_PREFIX "lock;"
+# endif
+#endif
+
+#define __HAVE_64B_ATOMICS 0
+#define USE_ATOMIC_COMPILER_BUILTINS 0
+#define ATOMIC_EXCHANGE_USES_CAS 0
+
+
+#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+ __sync_val_compare_and_swap (mem, oldval, newval)
+#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
+ (! __sync_bool_compare_and_swap (mem, oldval, newval))
+
+
+#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \
+ "je 0f\n\t" \
+ "lock\n" \
+ "0:\tcmpxchgb %b2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "q" (newval), "m" (*mem), "0" (oldval), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ ret; })
+
+#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \
+ "je 0f\n\t" \
+ "lock\n" \
+ "0:\tcmpxchgw %w2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "r" (newval), "m" (*mem), "0" (oldval), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ ret; })
+
+#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \
+ "je 0f\n\t" \
+ "lock\n" \
+ "0:\tcmpxchgl %2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "r" (newval), "m" (*mem), "0" (oldval), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ ret; })
+
+/* XXX We do not really need 64-bit compare-and-exchange. At least
+   not at the moment. Using it would mean causing portability
+ problems since not many other 32-bit architectures have support for
+ such an operation. So don't define any code for now. If it is
+ really going to be used the code below can be used on Intel Pentium
+ and later, but NOT on i486. */
+#if 1
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret = *(mem); \
+ abort (); \
+ ret = (newval); \
+ ret = (oldval); \
+ ret; })
+# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret = *(mem); \
+ abort (); \
+ ret = (newval); \
+ ret = (oldval); \
+ ret; })
+#else
+# ifdef __PIC__
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("xchgl %2, %%ebx\n\t" \
+ LOCK_PREFIX "cmpxchg8b %1\n\t" \
+ "xchgl %2, %%ebx" \
+ : "=A" (ret), "=m" (*mem) \
+ : "DS" (((unsigned long long int) (newval)) \
+ & 0xffffffff), \
+ "c" (((unsigned long long int) (newval)) >> 32), \
+ "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+ & 0xffffffff), \
+ "d" (((unsigned long long int) (oldval)) >> 32)); \
+ ret; })
+
+# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("xchgl %2, %%ebx\n\t" \
+ "cmpl $0, %%gs:%P7\n\t" \
+ "je 0f\n\t" \
+ "lock\n" \
+ "0:\tcmpxchg8b %1\n\t" \
+ "xchgl %2, %%ebx" \
+ : "=A" (ret), "=m" (*mem) \
+ : "DS" (((unsigned long long int) (newval)) \
+ & 0xffffffff), \
+ "c" (((unsigned long long int) (newval)) >> 32), \
+ "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+ & 0xffffffff), \
+ "d" (((unsigned long long int) (oldval)) >> 32), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ ret; })
+# else
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile (LOCK_PREFIX "cmpxchg8b %1" \
+ : "=A" (ret), "=m" (*mem) \
+ : "b" (((unsigned long long int) (newval)) \
+ & 0xffffffff), \
+ "c" (((unsigned long long int) (newval)) >> 32), \
+ "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+ & 0xffffffff), \
+ "d" (((unsigned long long int) (oldval)) >> 32)); \
+ ret; })
+
+# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile ("cmpl $0, %%gs:%P7\n\t" \
+ "je 0f\n\t" \
+ "lock\n" \
+ "0:\tcmpxchg8b %1" \
+ : "=A" (ret), "=m" (*mem) \
+ : "b" (((unsigned long long int) (newval)) \
+ & 0xffffffff), \
+ "c" (((unsigned long long int) (newval)) >> 32), \
+ "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+ & 0xffffffff), \
+ "d" (((unsigned long long int) (oldval)) >> 32), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ ret; })
+# endif
+#endif
+
+
+/* Note that we need no lock prefix. */
+#define atomic_exchange_acq(mem, newvalue) \
+ ({ __typeof (*mem) result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile ("xchgb %b0, %1" \
+ : "=q" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile ("xchgw %w0, %1" \
+ : "=r" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile ("xchgl %0, %1" \
+ : "=r" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else \
+ { \
+ result = 0; \
+ abort (); \
+ } \
+ result; })
+
+
+#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
+ ({ __typeof (*mem) __result; \
+ __typeof (value) __addval = (value); \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "xaddb %b0, %1" \
+ : "=q" (__result), "=m" (*mem) \
+ : "0" (__addval), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "xaddw %w0, %1" \
+ : "=r" (__result), "=m" (*mem) \
+ : "0" (__addval), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "xaddl %0, %1" \
+ : "=r" (__result), "=m" (*mem) \
+ : "0" (__addval), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __tmpval; \
+ __result = *__memp; \
+ do \
+ __tmpval = __result; \
+ while ((__result = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __result + __addval, __result)) == __tmpval); \
+ } \
+ __result; })
+
+#define atomic_exchange_and_add(mem, value) \
+ __sync_fetch_and_add (mem, value)
+
+#define __arch_exchange_and_add_cprefix \
+ "cmpl $0, %%gs:%P4\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_exchange_and_add(mem, value) \
+ __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c, \
+ mem, value)
+
+
+#define __arch_add_body(lock, pfx, mem, value) \
+ do { \
+ if (__builtin_constant_p (value) && (value) == 1) \
+ atomic_increment (mem); \
+ else if (__builtin_constant_p (value) && (value) == -1) \
+ atomic_decrement (mem); \
+ else if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "addb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (value), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "addw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (value), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "addl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (value), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ { \
+ __typeof (value) __addval = (value); \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval + __addval, __oldval)) == __tmpval); \
+ } \
+ } while (0)
+
+#define atomic_add(mem, value) \
+ __arch_add_body (LOCK_PREFIX, __arch, mem, value)
+
+#define __arch_add_cprefix \
+ "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_add(mem, value) \
+ __arch_add_body (__arch_add_cprefix, __arch_c, mem, value)
+
+
+#define atomic_add_negative(mem, value) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "iq" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else \
+ abort (); \
+ __result; })
+
+
+#define atomic_add_zero(mem, value) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "iq" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else \
+ abort (); \
+ __result; })
+
+
+#define __arch_increment_body(lock, pfx, mem) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "incb %b0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "incw %w0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "incl %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval + 1, __oldval)) == __tmpval); \
+ } \
+ } while (0)
+
+#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_increment_cprefix \
+ "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_increment(mem) \
+ __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
+
+
+#define atomic_increment_and_test(mem) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "incb %0; sete %b1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "incw %0; sete %w1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "incl %0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else \
+ abort (); \
+ __result; })
+
+
+#define __arch_decrement_body(lock, pfx, mem) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "decb %b0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "decw %w0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "decl %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval - 1, __oldval)) == __tmpval); \
+ } \
+ } while (0)
+
+#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
+
+#define __arch_decrement_cprefix \
+ "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t"
+
+#define catomic_decrement(mem) \
+ __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+
+
+#define atomic_decrement_and_test(mem) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "decb %b0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "decw %w0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "decl %0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else \
+ abort (); \
+ __result; })
+
+
+#define atomic_bit_set(mem, bit) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "orb %b2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "iq" (1 << (bit))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "orw %w2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "ir" (1 << (bit))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "orl %2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "ir" (1 << (bit))); \
+ else \
+ abort (); \
+ } while (0)
+
+
+#define atomic_bit_test_set(mem, bit) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else \
+ abort (); \
+ __result; })
+
+
+#define atomic_spin_nop() asm ("rep; nop")
+
+
+#define __arch_and_body(lock, mem, mask) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "andb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "andw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "andl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ abort (); \
+ } while (0)
+
+#define __arch_cprefix \
+ "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t"
+
+#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
+
+
+#define __arch_or_body(lock, mem, mask) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "orb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "orw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "orl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem), \
+ "i" (offsetof (tcbhead_t, multiple_threads))); \
+ else \
+ abort (); \
+ } while (0)
+
+#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
+
+#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
+
+/* We don't use mfence because it is supposedly slower due to having to
+ provide stronger guarantees (e.g., regarding self-modifying code). */
+#define atomic_full_barrier() \
+ __asm __volatile (LOCK_PREFIX "orl $0, (%%esp)" ::: "memory")
+#define atomic_read_barrier() __asm ("" ::: "memory")
+#define atomic_write_barrier() __asm ("" ::: "memory")
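+
+/* Illustrative sketch (added for exposition; not part of the original
+   header): the macros above behave like the following plain-C
+   equivalents written with GCC's __atomic builtins.  The identifiers
+   refcount and flags are invented for the example.
+
+     static int refcount = 1;
+     static int flags;
+
+     void release (void)
+     {
+       if (__atomic_sub_fetch (&refcount, 1, __ATOMIC_ACQ_REL) == 0)
+         __atomic_fetch_or (&flags, 1 << 3, __ATOMIC_ACQ_REL);
+     }
+
+   The first call corresponds to atomic_decrement_and_test (&refcount),
+   the second to atomic_bit_set (&flags, 3).  The catomic_* variants
+   additionally skip the lock prefix while %gs:multiple_threads is
+   zero, i.e. while the process is still single-threaded.  */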
diff --git a/REORG.TODO/sysdeps/i386/backtrace.c b/REORG.TODO/sysdeps/i386/backtrace.c
new file mode 100644
index 0000000000..ee8238d0ce
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/backtrace.c
@@ -0,0 +1,163 @@
+/* Return backtrace of current program state.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <libc-lock.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <unwind.h>
+
+struct trace_arg
+{
+ void **array;
+ int cnt, size;
+ void *lastebp, *lastesp;
+};
+
+#ifdef SHARED
+static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *);
+static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getcfa) (struct _Unwind_Context *);
+static _Unwind_Ptr (*unwind_getgr) (struct _Unwind_Context *, int);
+static void *libgcc_handle;
+
+static void
+init (void)
+{
+ libgcc_handle = __libc_dlopen ("libgcc_s.so.1");
+
+ if (libgcc_handle == NULL)
+ return;
+
+ unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace");
+ unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP");
+ unwind_getcfa = __libc_dlsym (libgcc_handle, "_Unwind_GetCFA");
+ unwind_getgr = __libc_dlsym (libgcc_handle, "_Unwind_GetGR");
+ if (unwind_getip == NULL || unwind_getgr == NULL || unwind_getcfa == NULL)
+ {
+ unwind_backtrace = NULL;
+ __libc_dlclose (libgcc_handle);
+ libgcc_handle = NULL;
+ }
+}
+#else
+# define unwind_backtrace _Unwind_Backtrace
+# define unwind_getip _Unwind_GetIP
+# define unwind_getcfa _Unwind_GetCFA
+# define unwind_getgr _Unwind_GetGR
+#endif
+
+static _Unwind_Reason_Code
+backtrace_helper (struct _Unwind_Context *ctx, void *a)
+{
+ struct trace_arg *arg = a;
+
+ /* We are first called with an address inside the __backtrace
+ function itself. Skip that frame. */
+ if (arg->cnt != -1)
+ arg->array[arg->cnt] = (void *) unwind_getip (ctx);
+ if (++arg->cnt == arg->size)
+ return _URC_END_OF_STACK;
+
+ /* %ebp is DWARF2 register 5 on IA-32. */
+ arg->lastebp = (void *) unwind_getgr (ctx, 5);
+ arg->lastesp = (void *) unwind_getcfa (ctx);
+ return _URC_NO_REASON;
+}
+
+
+/* This is a global variable set at program start time. It marks the
+ highest used stack address. */
+extern void *__libc_stack_end;
+
+
+/* This is the stack layout we see with every stack frame,
+ provided the code was not compiled with -fomit-frame-pointer.
+
+ +-----------------+ +-----------------+
+ %ebp -> | %ebp last frame--------> | %ebp last frame--->...
+ | | | |
+ | return address | | return address |
+ +-----------------+ +-----------------+
+
+ First try to get as far as possible using _Unwind_Backtrace,
+ which handles -fomit-frame-pointer as well but requires
+ .eh_frame info. Then fall back to walking the stack
+ manually. */
+
+struct layout
+{
+ struct layout *ebp;
+ void *ret;
+};
+
+
+int
+__backtrace (void **array, int size)
+{
+ struct trace_arg arg = { .array = array, .size = size, .cnt = -1 };
+
+ if (size <= 0)
+ return 0;
+
+#ifdef SHARED
+ __libc_once_define (static, once);
+
+ __libc_once (once, init);
+ if (unwind_backtrace == NULL)
+ return 0;
+#endif
+
+ unwind_backtrace (backtrace_helper, &arg);
+
+ if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL)
+ --arg.cnt;
+ else if (arg.cnt < size)
+ {
+ struct layout *ebp = (struct layout *) arg.lastebp;
+
+ while (arg.cnt < size)
+ {
+ /* Check for out of range. */
+ if ((void *) ebp < arg.lastesp || (void *) ebp > __libc_stack_end
+ || ((long) ebp & 3))
+ break;
+
+ array[arg.cnt++] = ebp->ret;
+ ebp = ebp->ebp;
+ }
+ }
+ return arg.cnt != -1 ? arg.cnt : 0;
+}
+weak_alias (__backtrace, backtrace)
+libc_hidden_def (__backtrace)
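+
+/* Illustrative use of the public entry point (a sketch added for
+   exposition, not part of this file).  backtrace fills the array with
+   return addresses, innermost frame first; backtrace_symbols is the
+   companion API from <execinfo.h>.
+
+     #include <execinfo.h>
+     #include <stdio.h>
+     #include <stdlib.h>
+
+     static void show_trace (void)
+     {
+       void *addrs[16];
+       int n = backtrace (addrs, 16);
+       char **syms = backtrace_symbols (addrs, n);
+       if (syms != NULL)
+         {
+           for (int i = 0; i < n; ++i)
+             printf ("%s\n", syms[i]);
+           free (syms);
+         }
+     }
+*/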
+
+
+#ifdef SHARED
+/* Free all resources if necessary. */
+libc_freeres_fn (free_mem)
+{
+ unwind_backtrace = NULL;
+ if (libgcc_handle != NULL)
+ {
+ __libc_dlclose (libgcc_handle);
+ libgcc_handle = NULL;
+ }
+}
+#endif
diff --git a/REORG.TODO/sysdeps/i386/bcopy.S b/REORG.TODO/sysdeps/i386/bcopy.S
new file mode 100644
index 0000000000..12b8ddb886
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bcopy.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY bcopy
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/bsd-_setjmp.S b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
new file mode 100644
index 0000000000..6496304946
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S
@@ -0,0 +1,56 @@
+/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'. i386 version.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 0)'.
+ We cannot do it in C because it must be a tail-call, so frame-unwinding
+ in setjmp doesn't clobber the state restored by longjmp. */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define JMPBUF PARMS
+#define SIGMSK JMPBUF+4
+
+ENTRY (_setjmp)
+
+ xorl %eax, %eax
+ movl JMPBUF(%esp), %edx
+
+ /* Save registers. */
+ movl %ebx, (JB_BX*4)(%edx)
+ movl %esi, (JB_SI*4)(%edx)
+ movl %edi, (JB_DI*4)(%edx)
+ leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_SP*4)(%edx)
+ movl 0(%esp), %ecx /* Save PC we are returning to now. */
+ LIBC_PROBE (setjmp, 3, 4@%edx, -4@$0, 4@%ecx)
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_PC*4)(%edx)
+ movl %ebp, (JB_BP*4)(%edx) /* Save caller's frame pointer. */
+
+ movl %eax, JB_SIZE(%edx) /* No signal mask set. */
+ ret
+END (_setjmp)
+libc_hidden_def (_setjmp)
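+
+/* Conceptually the entry point above behaves like the C sketch
+
+     int _setjmp (jmp_buf env) { return __sigsetjmp (env, 0); }
+
+   except that, as explained at the top of the file, C cannot guarantee
+   the required tail-call; the assembly therefore inlines the register
+   saving itself and records that no signal mask was saved.  (Sketch
+   added for exposition only.)  */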
diff --git a/REORG.TODO/sysdeps/i386/bsd-setjmp.S b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
new file mode 100644
index 0000000000..5710e1f42b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bsd-setjmp.S
@@ -0,0 +1,66 @@
+/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'. i386 version.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This just does a tail-call to `__sigsetjmp (ARG, 1)'.
+ We cannot do it in C because it must be a tail-call, so frame-unwinding
+ in setjmp doesn't clobber the state restored by longjmp. */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <stap-probe.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define JMPBUF PARMS
+#define SIGMSK JMPBUF+4
+
+ENTRY (setjmp)
+ /* Note that we have to use a non-exported symbol in the next
+ jump since otherwise gas will emit it as a jump through the
+ PLT, which we cannot use here. */
+
+ movl JMPBUF(%esp), %eax
+
+ /* Save registers. */
+ movl %ebx, (JB_BX*4)(%eax)
+ movl %esi, (JB_SI*4)(%eax)
+ movl %edi, (JB_DI*4)(%eax)
+ leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_SP*4)(%eax)
+ movl 0(%esp), %ecx /* Save PC we are returning to now. */
+ LIBC_PROBE (setjmp, 3, 4@%eax, -4@$1, 4@%ecx)
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_PC*4)(%eax)
+ movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */
+
+ /* Call __sigjmp_save. */
+ pushl $1
+ cfi_adjust_cfa_offset (4)
+ pushl 8(%esp)
+ cfi_adjust_cfa_offset (4)
+ call __sigjmp_save
+ popl %ecx
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ ret
+END (setjmp)
diff --git a/REORG.TODO/sysdeps/i386/bzero.S b/REORG.TODO/sysdeps/i386/bzero.S
new file mode 100644
index 0000000000..c8dd47b4da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/bzero.S
@@ -0,0 +1,5 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include "memset.S"
+
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/cacheinfo.c b/REORG.TODO/sysdeps/i386/cacheinfo.c
new file mode 100644
index 0000000000..f15fe0779a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/cacheinfo.c
@@ -0,0 +1,3 @@
+#define DISABLE_PREFETCHW
+
+#include <sysdeps/x86/cacheinfo.c>
diff --git a/REORG.TODO/sysdeps/i386/configure b/REORG.TODO/sysdeps/i386/configure
new file mode 100644
index 0000000000..5b55c5affe
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure
@@ -0,0 +1,84 @@
+# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+ as_fn_error $? "
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ...\"" "$LINENO" 5
+fi
+
+# The GNU C Library can't be built for i386. There are several reasons for
+# this restriction. The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation. While it is
+# possible that such atomic operations could be emulated in the kernel, to
+# date no such work has been done to enable this. Even with NPTL disabled you
+# still have no atomic.h implementation. Given the declining use of i386 we
+# disable support for building with `-march=i386' or `-mcpu=i386'. We don't
+# explicitly check for i386; instead we make sure the compiler can inline
+# the builtin __sync_val_compare_and_swap. If it can then we should have no
+# problem building for i386.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for compiler support of inlined builtin function __sync_val_compare_and_swap" >&5
+$as_echo_n "checking for compiler support of inlined builtin function __sync_val_compare_and_swap... " >&6; }
+libc_compiler_builtin_inlined=no
+cat > conftest.c <<EOF
+int _start (void) { int a, b, c; __sync_val_compare_and_swap (&a, b, c); return 0; }
+EOF
+if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS
+ -O0 -nostdlib -nostartfiles
+ -S conftest.c -o - | fgrep "__sync_val_compare_and_swap"
+ 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }
+then
+ libc_compiler_builtin_inlined=yes
+fi
+rm -f conftest*
+if test $libc_compiler_builtin_inlined = yes; then
+ libc_cv_unsupported_i386=no
+else
+ as_fn_error $? "
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ..." "$LINENO" 5
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_compiler_builtin_inlined" >&5
+$as_echo "$libc_compiler_builtin_inlined" >&6; }
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat > conftest.s <<\EOF
+ bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ libc_cv_asm_mpx=yes
+else
+ libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx = yes; then
+ $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
+$as_echo "#define USE_REGPARMS 1" >>confdefs.h
+
+
+$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
+
diff --git a/REORG.TODO/sysdeps/i386/configure.ac b/REORG.TODO/sysdeps/i386/configure.ac
new file mode 100644
index 0000000000..19ef33f34a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/configure.ac
@@ -0,0 +1,52 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/i386.
+
+# We no longer support i386 since it lacks the atomic instructions
+# required to implement NPTL threading.
+if test "$config_machine" = i386; then
+ AC_MSG_ERROR([
+*** ERROR: Support for i386 is deprecated.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ..."])
+fi
+
+# The GNU C Library can't be built for i386. There are several reasons for
+# this restriction. The primary reason is that i386 lacks the atomic
+# operations required to support the current NPTL implementation. While it is
+# possible that such atomic operations could be emulated in the kernel, to
+# date no such work has been done to enable this. Even with NPTL disabled you
+# still have no atomic.h implementation. Given the declining use of i386 we
+# disable support for building with `-march=i386' or `-mcpu=i386'. We don't
+# explicitly check for i386; instead we make sure the compiler can inline
+# the builtin __sync_val_compare_and_swap. If it can then we should have no
+# problem building for i386.
+LIBC_COMPILER_BUILTIN_INLINED(
+ [__sync_val_compare_and_swap],
+ [int a, b, c; __sync_val_compare_and_swap (&a, b, c);],
+ [-O0],
+ [libc_cv_unsupported_i386=no],
+ [AC_MSG_ERROR([
+*** Building with -march=i386/-mcpu=i386 is not supported.
+*** Please use host i786, i686, i586, or i486.
+*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ...])])
+
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+ bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+ libc_cv_asm_mpx=yes
+else
+ libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx = yes; then
+ AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
+AC_DEFINE(USE_REGPARMS)
+
+dnl It is always possible to access static and hidden symbols in an
+dnl position independent way.
+AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/REORG.TODO/sysdeps/i386/crti.S b/REORG.TODO/sysdeps/i386/crti.S
new file mode 100644
index 0000000000..f800209990
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crti.S
@@ -0,0 +1,84 @@
+/* Special .init and .fini section support for x86.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file with other
+ programs, and to distribute those programs without any restriction
+ coming from the use of this file. (The GNU Lesser General Public
+ License restrictions do apply in other respects; for example, they
+ cover modification of the file, and distribution when not linked
+ into another program.)
+
+ Note that people who make modified versions of this file are not
+ obligated to grant this special exception for their modified
+ versions; it is their choice whether to do so. The GNU Lesser
+ General Public License gives permission to release a modified
+ version without this exception; this exception also makes it
+ possible to release a modified version which carries forward this
+ exception.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* crti.S puts a function prologue at the beginning of the .init and
+ .fini sections and defines global symbols for those addresses, so
+ they can be called as functions. The symbols _init and _fini are
+ magic and cause the linker to emit DT_INIT and DT_FINI. */
+
+#include <libc-symbols.h>
+#include <sysdep.h>
+
+#ifndef PREINIT_FUNCTION
+# define PREINIT_FUNCTION __gmon_start__
+#endif
+
+#ifndef PREINIT_FUNCTION_WEAK
+# define PREINIT_FUNCTION_WEAK 1
+#endif
+
+#if PREINIT_FUNCTION_WEAK
+ weak_extern (PREINIT_FUNCTION)
+#else
+ .hidden PREINIT_FUNCTION
+#endif
+
+ .section .init,"ax",@progbits
+ .p2align 2
+ .globl _init
+ .type _init, @function
+_init:
+ pushl %ebx
+ /* Maintain 16-byte stack alignment for called functions. */
+ subl $8, %esp
+ LOAD_PIC_REG (bx)
+#if PREINIT_FUNCTION_WEAK
+ movl PREINIT_FUNCTION@GOT(%ebx), %eax
+ testl %eax, %eax
+ je .Lno_weak_fn
+ call PREINIT_FUNCTION@PLT
+.Lno_weak_fn:
+#else
+ call PREINIT_FUNCTION
+#endif
+
+ .section .fini,"ax",@progbits
+ .p2align 2
+ .globl _fini
+ .type _fini, @function
+_fini:
+ pushl %ebx
+ subl $8, %esp
+ LOAD_PIC_REG (bx)
diff --git a/REORG.TODO/sysdeps/i386/crtn.S b/REORG.TODO/sysdeps/i386/crtn.S
new file mode 100644
index 0000000000..b18b9c171a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/crtn.S
@@ -0,0 +1,47 @@
+/* Special .init and .fini section support for x86.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file with other
+ programs, and to distribute those programs without any restriction
+ coming from the use of this file. (The GNU Lesser General Public
+ License restrictions do apply in other respects; for example, they
+ cover modification of the file, and distribution when not linked
+ into another program.)
+
+ Note that people who make modified versions of this file are not
+ obligated to grant this special exception for their modified
+ versions; it is their choice whether to do so. The GNU Lesser
+ General Public License gives permission to release a modified
+ version without this exception; this exception also makes it
+ possible to release a modified version which carries forward this
+ exception.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* crtn.S puts function epilogues in the .init and .fini sections
+ corresponding to the prologues in crti.S. */
+
+ .section .init,"ax",@progbits
+ addl $8, %esp
+ popl %ebx
+ ret
+
+ .section .fini,"ax",@progbits
+ addl $8, %esp
+ popl %ebx
+ ret
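+
+/* Taken together with crti.S, the pieces above are laid out by the
+   linker so that the complete _init (and likewise _fini) reads
+   approximately:
+
+     _init: pushl %ebx
+            subl $8, %esp
+            LOAD_PIC_REG (bx)
+            ...body contributed by other input objects...
+            addl $8, %esp
+            popl %ebx
+            ret
+
+   This is a sketch of the concatenated .init section, added for
+   exposition; it is not code that appears verbatim in either file.  */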
diff --git a/REORG.TODO/sysdeps/i386/dl-irel.h b/REORG.TODO/sysdeps/i386/dl-irel.h
new file mode 100644
index 0000000000..824e81aed1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-irel.h
@@ -0,0 +1,51 @@
+/* Machine-dependent ELF indirect relocation inline functions.
+ i386 version.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _DL_IREL_H
+#define _DL_IREL_H
+
+#include <stdio.h>
+#include <unistd.h>
+
+#define ELF_MACHINE_IREL 1
+
+static inline Elf32_Addr
+__attribute ((always_inline))
+elf_ifunc_invoke (Elf32_Addr addr)
+{
+ return ((Elf32_Addr (*) (void)) (addr)) ();
+}
+
+static inline void
+__attribute ((always_inline))
+elf_irel (const Elf32_Rel *reloc)
+{
+ Elf32_Addr *const reloc_addr = (void *) reloc->r_offset;
+ const unsigned long int r_type = ELF32_R_TYPE (reloc->r_info);
+
+ if (__glibc_likely (r_type == R_386_IRELATIVE))
+ {
+ Elf32_Addr value = elf_ifunc_invoke(*reloc_addr);
+ *reloc_addr = value;
+ }
+ else
+ __libc_fatal ("unexpected reloc type in static binary");
+}
+
+#endif /* dl-irel.h */
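+
+/* For illustration (a sketch added for exposition, not part of this
+   header): an R_386_IRELATIVE relocation is what the static linker
+   emits for an STT_GNU_IFUNC definition such as the one below;
+   elf_irel above invokes the resolver and patches the relocated word
+   with its return value.  The function names and the cpu_has_sse2
+   predicate are invented for the example.
+
+     static int add_generic (int a, int b) { return a + b; }
+     static int add_sse2 (int a, int b) { return a + b; }
+
+     static __typeof__ (add_generic) *
+     resolve_add (void)
+     {
+       return cpu_has_sse2 () ? add_sse2 : add_generic;
+     }
+
+     int add (int, int) __attribute__ ((ifunc ("resolve_add")));
+*/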
diff --git a/REORG.TODO/sysdeps/i386/dl-lookupcfg.h b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
new file mode 100644
index 0000000000..47b534a059
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h
@@ -0,0 +1,32 @@
+/* Configuration of lookup functions.
+ Copyright (C) 2005-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define DL_UNMAP_IS_SPECIAL
+
+#include_next <dl-lookupcfg.h>
+
+/* The address of protected data defined in a shared library may be
+ external due to a copy relocation. */
+#define DL_EXTERN_PROTECTED_DATA
+
+struct link_map;
+
+extern void _dl_unmap (struct link_map *map)
+ internal_function attribute_hidden;
+
+#define DL_UNMAP(map) _dl_unmap (map)
diff --git a/REORG.TODO/sysdeps/i386/dl-machine.h b/REORG.TODO/sysdeps/i386/dl-machine.h
new file mode 100644
index 0000000000..57d4a0bdbd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-machine.h
@@ -0,0 +1,757 @@
+/* Machine-dependent ELF dynamic relocation inline functions. i386 version.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef dl_machine_h
+#define dl_machine_h
+
+#define ELF_MACHINE_NAME "i386"
+
+#include <sys/param.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <cpu-features.c>
+
+/* Return nonzero iff ELF header is compatible with the running host. */
+static inline int __attribute__ ((unused))
+elf_machine_matches_host (const Elf32_Ehdr *ehdr)
+{
+ return ehdr->e_machine == EM_386;
+}
+
+
+/* Return the link-time address of _DYNAMIC. Conveniently, this is the
+ first element of the GOT, a special entry that is never relocated. */
+static inline Elf32_Addr __attribute__ ((unused, const))
+elf_machine_dynamic (void)
+{
+ /* This produces a GOTOFF reloc that resolves to zero at link time, so in
+ fact just loads from the GOT register directly. By doing it without
+ an asm we can let the compiler choose any register. */
+ extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
+ return _GLOBAL_OFFSET_TABLE_[0];
+}
+
+/* Return the run-time load address of the shared object. */
+static inline Elf32_Addr __attribute__ ((unused))
+elf_machine_load_address (void)
+{
+ /* Compute the difference between the runtime address of _DYNAMIC as seen
+ by a GOTOFF reference, and the link-time address found in the special
+ unrelocated first GOT entry. */
+ extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
+ return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
+}
+
+/* Set up the loaded object described by L so its unrelocated PLT
+ entries will jump to the on-demand fixup code in dl-runtime.c. */
+
+static inline int __attribute__ ((unused, always_inline))
+elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
+{
+ Elf32_Addr *got;
+ extern void _dl_runtime_resolve (Elf32_Word) attribute_hidden;
+ extern void _dl_runtime_profile (Elf32_Word) attribute_hidden;
+
+ if (l->l_info[DT_JMPREL] && lazy)
+ {
+ /* The GOT entries for functions in the PLT have not yet been filled
+ in. Their initial contents arrange, when called, to push an
+ offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1],
+ and then jump to _GLOBAL_OFFSET_TABLE_[2]. */
+ got = (Elf32_Addr *) D_PTR (l, l_info[DT_PLTGOT]);
+ /* If a library is prelinked but we have to relocate anyway,
+ we have to be able to undo the prelinking of .got.plt.
+ The prelinker saved the address of .plt + 0x16 here. */
+ if (got[1])
+ {
+ l->l_mach.plt = got[1] + l->l_addr;
+ l->l_mach.gotplt = (Elf32_Addr) &got[3];
+ }
+ got[1] = (Elf32_Addr) l; /* Identify this shared object. */
+
+ /* The got[2] entry contains the address of a function which gets
+ called to obtain the address of a so far unresolved function and
+ jump to it. The profiling extension of the dynamic linker allows
+ intercepting these calls to collect information. In that case we
+ don't store the resolved address in the GOT, so that all future
+ calls also end up in this function. */
+ if (__glibc_unlikely (profile))
+ {
+ got[2] = (Elf32_Addr) &_dl_runtime_profile;
+
+ if (GLRO(dl_profile) != NULL
+ && _dl_name_match_p (GLRO(dl_profile), l))
+ /* This is the object we are looking for. Say that we really
+ want profiling and the timers are started. */
+ GL(dl_profile_map) = l;
+ }
+ else
+ /* This function will get called to fix up the GOT entry indicated by
+ the offset on the stack, and then jump to the resolved address. */
+ got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+ }
+
+ return lazy;
+}
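+
+/* Illustration (a sketch added for exposition, not part of this file):
+   with the setup above, a not-yet-resolved PLT entry executes roughly
+
+     jmp *name@GOT(%ebx)    # initially points back at the next insn
+     pushl $reloc_offset    # offset into .rel.plt
+     jmp .PLT0              # pushes got[1] (the link map), jumps to got[2]
+
+   so control reaches _dl_runtime_resolve with the link map and the
+   relocation offset on the stack; it resolves the symbol, stores the
+   result in the GOT entry, and jumps to it, so that later calls go
+   straight to the target.  */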
+
+#ifdef IN_DL_RUNTIME
+
+# ifndef PROF
+/* We add a declaration of this function here so that in dl-runtime.c
+ the ELF_MACHINE_RUNTIME_TRAMPOLINE macro really can pass the parameters
+ in registers.
+
+ We cannot use this scheme for profiling because the _mcount call
+ destroys the passed register information. */
+#define ARCH_FIXUP_ATTRIBUTE __attribute__ ((regparm (3), stdcall, unused))
+
+extern ElfW(Addr) _dl_fixup (struct link_map *l,
+ ElfW(Word) reloc_offset)
+ ARCH_FIXUP_ATTRIBUTE;
+extern ElfW(Addr) _dl_profile_fixup (struct link_map *l,
+ ElfW(Word) reloc_offset,
+ ElfW(Addr) retaddr, void *regs,
+ long int *framesizep)
+ ARCH_FIXUP_ATTRIBUTE;
+# endif
+
+#endif
+
+/* Mask identifying addresses reserved for the user program,
+ where the dynamic linker should not map anything. */
+#define ELF_MACHINE_USER_ADDRESS_MASK 0xf8000000UL
+
+/* Initial entry point code for the dynamic linker.
+ The C function `_dl_start' is the real entry point;
+ its return value is the user program's entry point. */
+
+#define RTLD_START asm ("\n\
+ .text\n\
+ .align 16\n\
+0: movl (%esp), %ebx\n\
+ ret\n\
+ .align 16\n\
+.globl _start\n\
+.globl _dl_start_user\n\
+_start:\n\
+ # Note that _dl_start gets the parameter in %eax.\n\
+ movl %esp, %eax\n\
+ call _dl_start\n\
+_dl_start_user:\n\
+ # Save the user entry point address in %edi.\n\
+ movl %eax, %edi\n\
+ # Point %ebx at the GOT.\n\
+ call 0b\n\
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx\n\
+ # See if we were run as a command with the executable file\n\
+ # name as an extra leading argument.\n\
+ movl _dl_skip_args@GOTOFF(%ebx), %eax\n\
+ # Pop the original argument count.\n\
+ popl %edx\n\
+ # Adjust the stack pointer to skip _dl_skip_args words.\n\
+ leal (%esp,%eax,4), %esp\n\
+ # Subtract _dl_skip_args from argc.\n\
+ subl %eax, %edx\n\
+ # Push argc back on the stack.\n\
+ push %edx\n\
+ # The special initializer gets called with the stack just\n\
+ # as the application's entry point will see it; it can\n\
+ # switch stacks if it moves these contents over.\n\
+" RTLD_START_SPECIAL_INIT "\n\
+ # Load the parameters again.\n\
+ # (eax, edx, ecx, *--esp) = (_dl_loaded, argc, argv, envp)\n\
+ movl _rtld_local@GOTOFF(%ebx), %eax\n\
+ leal 8(%esp,%edx,4), %esi\n\
+ leal 4(%esp), %ecx\n\
+ movl %esp, %ebp\n\
+ # Make sure _dl_init is run with 16 byte aligned stack.\n\
+ andl $-16, %esp\n\
+ pushl %eax\n\
+ pushl %eax\n\
+ pushl %ebp\n\
+ pushl %esi\n\
+ # Clear %ebp, so that even constructors have terminated backchain.\n\
+ xorl %ebp, %ebp\n\
+ # Call the function to run the initializers.\n\
+ call _dl_init\n\
+ # Pass our finalizer function to the user in %edx, as per ELF ABI.\n\
+ leal _dl_fini@GOTOFF(%ebx), %edx\n\
+ # Restore %esp _start expects.\n\
+ movl (%esp), %esp\n\
+ # Jump to the user's entry point.\n\
+ jmp *%edi\n\
+ .previous\n\
+");
+
+#ifndef RTLD_START_SPECIAL_INIT
+# define RTLD_START_SPECIAL_INIT /* nothing */
+#endif
+
+/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or
+ TLS variable, so undefined references should not be allowed to
+ define the value.
+ ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one
+ of the main executable's symbols, as for a COPY reloc.
+ ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes a relocation
+ against protected data whose address may be external due to a copy
+ relocation. */
+# define elf_machine_type_class(type) \
+ ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32 \
+ || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32 \
+ || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC) \
+ * ELF_RTYPE_CLASS_PLT) \
+ | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY) \
+ | (((type) == R_386_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+
+/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */
+#define ELF_MACHINE_JMP_SLOT R_386_JMP_SLOT
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+ Prelinked libraries may use Elf32_Rela though. */
+#define ELF_MACHINE_PLT_REL 1
+
+/* We define an initialization function. This is called very early in
+ _dl_sysdep_start. */
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+#if IS_IN (rtld)
+ /* init_cpu_features has been called early from __libc_start_main in
+ a static executable. */
+ init_cpu_features (&GLRO(dl_x86_cpu_features));
+#else
+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+ /* Avoid an empty string which would disturb us. */
+ GLRO(dl_platform) = NULL;
+#endif
+}
+
+static inline Elf32_Addr
+elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+ const Elf32_Rel *reloc,
+ Elf32_Addr *reloc_addr, Elf32_Addr value)
+{
+ return *reloc_addr = value;
+}
+
+/* Return the final value of a plt relocation. */
+static inline Elf32_Addr
+elf_machine_plt_value (struct link_map *map, const Elf32_Rel *reloc,
+ Elf32_Addr value)
+{
+ return value;
+}
+
+
+/* Names of the architecture-specific auditing callback functions. */
+#define ARCH_LA_PLTENTER i86_gnu_pltenter
+#define ARCH_LA_PLTEXIT i86_gnu_pltexit
+
+#endif /* !dl_machine_h */
+
+/* The i386 never uses Elf32_Rela relocations for the dynamic linker.
+ Prelinked libraries may use Elf32_Rela though. */
+#define ELF_MACHINE_NO_RELA defined RTLD_BOOTSTRAP
+#define ELF_MACHINE_NO_REL 0
+
+#ifdef RESOLVE_MAP
+
+/* Perform the relocation specified by RELOC and SYM (which is fully resolved).
+ MAP is the object containing the reloc. */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
+ const Elf32_Sym *sym, const struct r_found_version *version,
+ void *const reloc_addr_arg, int skip_ifunc)
+{
+ Elf32_Addr *const reloc_addr = reloc_addr_arg;
+ const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC
+ if (__glibc_unlikely (r_type == R_386_RELATIVE))
+ {
+# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC
+ /* This is defined in rtld.c, but nowhere in the static libc.a;
+ make the reference weak so static programs can still link.
+ This declaration cannot be done when compiling rtld.c
+ (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the
+ common defn for _dl_rtld_map, which is incompatible with a
+ weak decl in the same file. */
+# ifndef SHARED
+ weak_extern (_dl_rtld_map);
+# endif
+ if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */
+# endif
+ *reloc_addr += map->l_addr;
+ }
+# ifndef RTLD_BOOTSTRAP
+ else if (__glibc_unlikely (r_type == R_386_NONE))
+ return;
+# endif
+ else
+# endif /* !RTLD_BOOTSTRAP and have no -z combreloc */
+ {
+# ifndef RTLD_BOOTSTRAP
+ const Elf32_Sym *const refsym = sym;
+# endif
+ struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+ Elf32_Addr value = sym_map == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+ if (sym != NULL
+ && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
+ 0)
+ && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+ && __builtin_expect (!skip_ifunc, 1))
+ {
+# ifndef RTLD_BOOTSTRAP
+ if (sym_map != map
+ && sym_map->l_type != lt_executable
+ && !sym_map->l_relocated)
+ {
+ const char *strtab
+ = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+ _dl_error_printf ("\
+%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
+ RTLD_PROGNAME, map->l_name,
+ sym_map->l_name,
+ strtab + refsym->st_name);
+ }
+# endif
+ value = ((Elf32_Addr (*) (void)) value) ();
+ }
+
+ switch (r_type)
+ {
+# ifndef RTLD_BOOTSTRAP
+ case R_386_SIZE32:
+ /* Set to symbol size plus addend. */
+ *reloc_addr += sym->st_size;
+ break;
+# endif
+ case R_386_GLOB_DAT:
+ case R_386_JMP_SLOT:
+ *reloc_addr = value;
+ break;
+
+ case R_386_TLS_DTPMOD32:
+# ifdef RTLD_BOOTSTRAP
+ /* During startup the dynamic linker is always the module
+ with index 1.
+ XXX If this relocation is necessary move before RESOLVE
+ call. */
+ *reloc_addr = 1;
+# else
+ /* Get the information from the link map returned by the
+ resolve function. */
+ if (sym_map != NULL)
+ *reloc_addr = sym_map->l_tls_modid;
+# endif
+ break;
+ case R_386_TLS_DTPOFF32:
+# ifndef RTLD_BOOTSTRAP
+ /* During relocation all TLS symbols are defined and used.
+ Therefore the offset is already correct. */
+ if (sym != NULL)
+ *reloc_addr = sym->st_value;
+# endif
+ break;
+ case R_386_TLS_DESC:
+ {
+ struct tlsdesc volatile *td =
+ (struct tlsdesc volatile *)reloc_addr;
+
+# ifndef RTLD_BOOTSTRAP
+ if (! sym)
+ td->entry = _dl_tlsdesc_undefweak;
+ else
+# endif
+ {
+# ifndef RTLD_BOOTSTRAP
+# ifndef SHARED
+ CHECK_STATIC_TLS (map, sym_map);
+# else
+ if (!TRY_STATIC_TLS (map, sym_map))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic
+ (sym_map, sym->st_value + (ElfW(Word))td->arg);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+# endif
+# endif
+ {
+ td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+ + (ElfW(Word))td->arg);
+ td->entry = _dl_tlsdesc_return;
+ }
+ }
+ break;
+ }
+ case R_386_TLS_TPOFF32:
+ /* The offset is positive, backward from the thread pointer. */
+# ifdef RTLD_BOOTSTRAP
+ *reloc_addr += map->l_tls_offset - sym->st_value;
+# else
+ /* We know the offset of the object the symbol is contained in.
+ It is a positive value which will be subtracted from the
+ thread pointer. To get the variable position in the TLS
+ block we subtract the offset from that of the TLS block. */
+ if (sym != NULL)
+ {
+ CHECK_STATIC_TLS (map, sym_map);
+ *reloc_addr += sym_map->l_tls_offset - sym->st_value;
+ }
+# endif
+ break;
+ case R_386_TLS_TPOFF:
+ /* The offset is negative, forward from the thread pointer. */
+# ifdef RTLD_BOOTSTRAP
+ *reloc_addr += sym->st_value - map->l_tls_offset;
+# else
+ /* We know the offset of the object the symbol is contained in.
+ It is a negative value which will be added to the
+ thread pointer. */
+ if (sym != NULL)
+ {
+ CHECK_STATIC_TLS (map, sym_map);
+ *reloc_addr += sym->st_value - sym_map->l_tls_offset;
+ }
+# endif
+ break;
+
+# ifndef RTLD_BOOTSTRAP
+ case R_386_32:
+ *reloc_addr += value;
+ break;
+ case R_386_PC32:
+ *reloc_addr += (value - (Elf32_Addr) reloc_addr);
+ break;
+ case R_386_COPY:
+ if (sym == NULL)
+ /* This can happen in trace mode if an object could not be
+ found. */
+ break;
+ if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+ || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+ && GLRO(dl_verbose)))
+ {
+ const char *strtab;
+
+ strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+ _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+ RTLD_PROGNAME, strtab + refsym->st_name);
+ }
+ memcpy (reloc_addr_arg, (void *) value,
+ MIN (sym->st_size, refsym->st_size));
+ break;
+ case R_386_IRELATIVE:
+ value = map->l_addr + *reloc_addr;
+ value = ((Elf32_Addr (*) (void)) value) ();
+ *reloc_addr = value;
+ break;
+ default:
+ _dl_reloc_bad_type (map, r_type, 0);
+ break;
+# endif /* !RTLD_BOOTSTRAP */
+ }
+ }
+}
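+
+/* Worked example for the two TPOFF flavors handled above (numbers
+   invented for exposition).  Suppose a module's static TLS block lives
+   at l_tls_offset = 0x40 below the thread pointer TP, and a variable
+   sits at st_value = 0x8 within the block.  For R_386_TLS_TPOFF32 the
+   stored offset becomes 0x40 - 0x8 = 0x38 and the access computes
+   TP - 0x38; for R_386_TLS_TPOFF it becomes 0x8 - 0x40 = -0x38 and the
+   access computes TP + (-0x38).  Both name the same address; only the
+   sign convention differs.  */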
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
+ const Elf32_Sym *sym, const struct r_found_version *version,
+ void *const reloc_addr_arg, int skip_ifunc)
+{
+ Elf32_Addr *const reloc_addr = reloc_addr_arg;
+ const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+
+ if (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE)
+ *reloc_addr = map->l_addr + reloc->r_addend;
+ else if (r_type != R_386_NONE)
+ {
+# ifndef RESOLVE_CONFLICT_FIND_MAP
+ const Elf32_Sym *const refsym = sym;
+# endif
+ struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
+ Elf32_Addr value = sym == NULL ? 0 : sym_map->l_addr + sym->st_value;
+
+ if (sym != NULL
+ && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+ && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+ && __builtin_expect (!skip_ifunc, 1))
+ value = ((Elf32_Addr (*) (void)) value) ();
+
+ switch (ELF32_R_TYPE (reloc->r_info))
+ {
+ case R_386_SIZE32:
+ /* Set to symbol size plus addend. */
+ value = sym->st_size;
+ /* Fall through. */
+ case R_386_GLOB_DAT:
+ case R_386_JMP_SLOT:
+ case R_386_32:
+ *reloc_addr = value + reloc->r_addend;
+ break;
+# ifndef RESOLVE_CONFLICT_FIND_MAP
+ /* Not needed for dl-conflict.c. */
+ case R_386_PC32:
+ *reloc_addr = (value + reloc->r_addend - (Elf32_Addr) reloc_addr);
+ break;
+
+ case R_386_TLS_DTPMOD32:
+ /* Get the information from the link map returned by the
+ resolve function. */
+ if (sym_map != NULL)
+ *reloc_addr = sym_map->l_tls_modid;
+ break;
+ case R_386_TLS_DTPOFF32:
+ /* During relocation all TLS symbols are defined and used.
+ Therefore the offset is already correct. */
+ *reloc_addr = (sym == NULL ? 0 : sym->st_value) + reloc->r_addend;
+ break;
+ case R_386_TLS_DESC:
+ {
+ struct tlsdesc volatile *td =
+ (struct tlsdesc volatile *)reloc_addr;
+
+# ifndef RTLD_BOOTSTRAP
+ if (!sym)
+ {
+ td->arg = (void*)reloc->r_addend;
+ td->entry = _dl_tlsdesc_undefweak;
+ }
+ else
+# endif
+ {
+# ifndef RTLD_BOOTSTRAP
+# ifndef SHARED
+ CHECK_STATIC_TLS (map, sym_map);
+# else
+ if (!TRY_STATIC_TLS (map, sym_map))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic
+ (sym_map, sym->st_value + reloc->r_addend);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+# endif
+# endif
+ {
+ td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+ + reloc->r_addend);
+ td->entry = _dl_tlsdesc_return;
+ }
+ }
+ }
+ break;
+ case R_386_TLS_TPOFF32:
+ /* The offset is positive, backward from the thread pointer. */
+ /* We know the offset of the object the symbol is contained in.
+ It is a positive value which will be subtracted from the
+ thread pointer. To get the variable position in the TLS
+ block we subtract the offset from that of the TLS block. */
+ if (sym != NULL)
+ {
+ CHECK_STATIC_TLS (map, sym_map);
+ *reloc_addr = sym_map->l_tls_offset - sym->st_value
+ + reloc->r_addend;
+ }
+ break;
+ case R_386_TLS_TPOFF:
+ /* The offset is negative, forward from the thread pointer. */
+ /* We know the offset of the object the symbol is contained in.
+ It is a negative value which will be added to the
+ thread pointer. */
+ if (sym != NULL)
+ {
+ CHECK_STATIC_TLS (map, sym_map);
+ *reloc_addr = sym->st_value - sym_map->l_tls_offset
+ + reloc->r_addend;
+ }
+ break;
+ case R_386_COPY:
+ if (sym == NULL)
+ /* This can happen in trace mode if an object could not be
+ found. */
+ break;
+ if (__builtin_expect (sym->st_size > refsym->st_size, 0)
+ || (__builtin_expect (sym->st_size < refsym->st_size, 0)
+ && GLRO(dl_verbose)))
+ {
+ const char *strtab;
+
+ strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+ _dl_error_printf ("\
+%s: Symbol `%s' has different size in shared object, consider re-linking\n",
+ RTLD_PROGNAME, strtab + refsym->st_name);
+ }
+ memcpy (reloc_addr_arg, (void *) value,
+ MIN (sym->st_size, refsym->st_size));
+ break;
+# endif /* !RESOLVE_CONFLICT_FIND_MAP */
+ case R_386_IRELATIVE:
+ value = map->l_addr + reloc->r_addend;
+ value = ((Elf32_Addr (*) (void)) value) ();
+ *reloc_addr = value;
+ break;
+ default:
+ /* We add these checks to the version used to relocate ld.so
+ only while we are still debugging. */
+ _dl_reloc_bad_type (map, r_type, 0);
+ break;
+ }
+ }
+}
+# endif /* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute ((always_inline))
+elf_machine_rel_relative (Elf32_Addr l_addr, const Elf32_Rel *reloc,
+ void *const reloc_addr_arg)
+{
+ Elf32_Addr *const reloc_addr = reloc_addr_arg;
+ assert (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE);
+ *reloc_addr += l_addr;
+}
+
+# ifndef RTLD_BOOTSTRAP
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
+ void *const reloc_addr_arg)
+{
+ Elf32_Addr *const reloc_addr = reloc_addr_arg;
+ *reloc_addr = l_addr + reloc->r_addend;
+}
+# endif /* !RTLD_BOOTSTRAP */
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rel (struct link_map *map,
+ Elf32_Addr l_addr, const Elf32_Rel *reloc,
+ int skip_ifunc)
+{
+ Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+ const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+ /* Check for unexpected PLT reloc type. */
+ if (__glibc_likely (r_type == R_386_JMP_SLOT))
+ {
+ if (__builtin_expect (map->l_mach.plt, 0) == 0)
+ *reloc_addr += l_addr;
+ else
+ *reloc_addr = (map->l_mach.plt
+ + (((Elf32_Addr) reloc_addr) - map->l_mach.gotplt) * 4);
+ }
+ else if (__glibc_likely (r_type == R_386_TLS_DESC))
+ {
+ struct tlsdesc volatile * __attribute__((__unused__)) td =
+ (struct tlsdesc volatile *)reloc_addr;
+
+ /* Handle relocations that reference the local *ABS* in a simple
+ way, so as to preserve a potential addend. */
+ if (ELF32_R_SYM (reloc->r_info) == 0)
+ td->entry = _dl_tlsdesc_resolve_abs_plus_addend;
+ /* Given a known-zero addend, we can store a pointer to the
+ reloc in the arg position. */
+ else if (td->arg == 0)
+ {
+ td->arg = (void*)reloc;
+ td->entry = _dl_tlsdesc_resolve_rel;
+ }
+ else
+ {
+ /* We could handle non-*ABS* relocations with non-zero addends
+ by dynamically allocating an arg to hold a pointer to the
+ reloc, but that sounds pointless. */
+ const Elf32_Rel *const r = reloc;
+ /* The code below was borrowed from elf_dynamic_do_rel(). */
+ const ElfW(Sym) *const symtab =
+ (const void *) D_PTR (map, l_info[DT_SYMTAB]);
+
+# ifdef RTLD_BOOTSTRAP
+ /* The dynamic linker always uses versioning. */
+ assert (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL);
+# else
+ if (map->l_info[VERSYMIDX (DT_VERSYM)])
+# endif
+ {
+ const ElfW(Half) *const version =
+ (const void *) D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+ ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff;
+ elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)],
+ &map->l_versions[ndx],
+ (void *) (l_addr + r->r_offset), skip_ifunc);
+ }
+# ifndef RTLD_BOOTSTRAP
+ else
+ elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL,
+ (void *) (l_addr + r->r_offset), skip_ifunc);
+# endif
+ }
+ }
+ else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+ {
+ Elf32_Addr value = map->l_addr + *reloc_addr;
+ if (__glibc_likely (!skip_ifunc))
+ value = ((Elf32_Addr (*) (void)) value) ();
+ *reloc_addr = value;
+ }
+ else
+ _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# ifndef RTLD_BOOTSTRAP
+
+auto inline void
+__attribute__ ((always_inline))
+elf_machine_lazy_rela (struct link_map *map,
+ Elf32_Addr l_addr, const Elf32_Rela *reloc,
+ int skip_ifunc)
+{
+ Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
+ const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
+ if (__glibc_likely (r_type == R_386_JMP_SLOT))
+ ;
+ else if (__glibc_likely (r_type == R_386_TLS_DESC))
+ {
+ struct tlsdesc volatile * __attribute__((__unused__)) td =
+ (struct tlsdesc volatile *)reloc_addr;
+
+ td->arg = (void*)reloc;
+ td->entry = _dl_tlsdesc_resolve_rela;
+ }
+ else if (__glibc_unlikely (r_type == R_386_IRELATIVE))
+ {
+ Elf32_Addr value = map->l_addr + reloc->r_addend;
+ if (__glibc_likely (!skip_ifunc))
+ value = ((Elf32_Addr (*) (void)) value) ();
+ *reloc_addr = value;
+ }
+ else
+ _dl_reloc_bad_type (map, r_type, 1);
+}
+
+# endif /* !RTLD_BOOTSTRAP */
+
+#endif /* RESOLVE_MAP */
diff --git a/REORG.TODO/sysdeps/i386/dl-procinfo.c b/REORG.TODO/sysdeps/i386/dl-procinfo.c
new file mode 100644
index 0000000000..7237f778b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-procinfo.c
@@ -0,0 +1,65 @@
+/* Data for i386 version of processor capability information.
+ Copyright (C) 2001-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* If anything should be added here, check whether the size of each
+ string is still ok with the given array size.
+
+ All the #ifdefs in the definitions are quite irritating but
+ necessary if we want to avoid duplicating the information. There
+ are three different modes:
+
+ - PROCINFO_DECL is defined. This means we are only interested in
+ declarations.
+
+ - PROCINFO_DECL is not defined:
+
+ + if SHARED is defined the file is included in an array
+ initializer. The .element = { ... } syntax is needed.
+
+ + if SHARED is not defined a normal array initialization is
+ needed.
+ */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#include <sysdeps/x86/dl-procinfo.c>
+
+#if !defined PROCINFO_DECL && defined SHARED
+ ._dl_x86_cap_flags
+#else
+PROCINFO_CLASS const char _dl_x86_cap_flags[32][8]
+#endif
+#ifndef PROCINFO_DECL
+= {
+ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+ "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov",
+ "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx",
+ "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe"
+ }
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
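+
+/* Illustration of the inclusion modes described above (a sketch added
+   for exposition; the consumer shown is hypothetical):
+
+     #define PROCINFO_DECL
+     #define PROCINFO_CLASS extern
+     #include <dl-procinfo.c>
+
+   yields only the declaration `extern const char _dl_x86_cap_flags
+   [32][8];', whereas including the file with PROCINFO_DECL undefined
+   and SHARED defined yields the `._dl_x86_cap_flags = { ... },' form
+   suitable for use inside a struct initializer.  */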
diff --git a/REORG.TODO/sysdeps/i386/dl-tls.h b/REORG.TODO/sysdeps/i386/dl-tls.h
new file mode 100644
index 0000000000..525ebab992
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tls.h
@@ -0,0 +1,61 @@
+/* Thread-local storage handling in the ELF dynamic linker. i386 version.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+/* Type used for the representation of TLS information in the GOT. */
+typedef struct dl_tls_index
+{
+ unsigned long int ti_module;
+ unsigned long int ti_offset;
+} tls_index;
+
+
+#ifdef SHARED
+/* This is the prototype for the GNU version. */
+extern void *___tls_get_addr (tls_index *ti)
+ __attribute__ ((__regparm__ (1)));
+extern void *___tls_get_addr_internal (tls_index *ti)
+ __attribute__ ((__regparm__ (1))) attribute_hidden;
+
+# if IS_IN (rtld)
+/* The special thing about the x86 TLS ABI is that we have two
+ variants of the __tls_get_addr function with different calling
+ conventions. The GNU version, with which we are mostly concerned
+ here, takes the parameter in a register. Its name is changed by
+ adding an additional underscore at the beginning. The Sun version
+ uses the normal calling convention. */
+void *
+__tls_get_addr (tls_index *ti)
+{
+ return ___tls_get_addr_internal (ti);
+}
+
+
+/* Prepare using the definition of __tls_get_addr in the generic
+ version of this file. */
+# define __tls_get_addr __attribute__ ((__regparm__ (1))) ___tls_get_addr
+strong_alias (___tls_get_addr, ___tls_get_addr_internal)
+rtld_hidden_proto (___tls_get_addr)
+rtld_hidden_def (___tls_get_addr)
+# else
+
+/* Users should get the better interface. */
+#  define __tls_get_addr ___tls_get_addr
+
+# endif
+#endif
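+
+/* Illustration (a sketch of assumed GCC code generation, added for
+   exposition): in the general-dynamic model a 32-bit access to a TLS
+   variable x compiles to roughly
+
+     leal x@tlsgd(,%ebx,1), %eax
+     call ___tls_get_addr@plt
+
+   i.e. the tls_index pointer arrives in %eax, which is why the GNU
+   variant above is declared with __regparm__ (1).  */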
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.S b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
new file mode 100644
index 0000000000..8befdc2b39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S
@@ -0,0 +1,285 @@
+/* Thread-local storage handling in the ELF dynamic linker. i386 version.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <tls.h>
+#include "tlsdesc.h"
+
+ .text
+
+ /* This function is used to compute the TP offset for symbols in
+ Static TLS, i.e., whose TP offset is the same for all
+ threads.
+
+ The incoming %eax points to the TLS descriptor, such that
+ 0(%eax) points to _dl_tlsdesc_return itself, and 4(%eax) holds
+ the TP offset of the symbol corresponding to the object
+ denoted by the argument. */
+
+ .hidden _dl_tlsdesc_return
+ .global _dl_tlsdesc_return
+ .type _dl_tlsdesc_return,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_return:
+ movl 4(%eax), %eax
+ ret
+ cfi_endproc
+ .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
+
+ /* This function is used for undefined weak TLS symbols, for
+ which the base address (i.e., disregarding any addend) should
+ resolve to NULL.
+
+ %eax points to the TLS descriptor, such that 0(%eax) points to
+ _dl_tlsdesc_undefweak itself, and 4(%eax) holds the addend.
+ We return the addend minus the TP, such that, when the caller
+ adds TP, it gets the addend back. If that's zero, as usual,
+ that's most likely a NULL pointer. */
+
+ .hidden _dl_tlsdesc_undefweak
+ .global _dl_tlsdesc_undefweak
+ .type _dl_tlsdesc_undefweak,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_undefweak:
+ movl 4(%eax), %eax
+ subl %gs:0, %eax
+ ret
+ cfi_endproc
+ .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
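+
+	/* In the style of the C rendition given for _dl_tlsdesc_dynamic
+	   below, the two entry points above correspond roughly to the
+	   following sketch (added for exposition; __thread_pointer
+	   stands for %gs:0):
+
+ptrdiff_t __attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_return (struct tlsdesc *tdp)
+{
+  return (ptrdiff_t) tdp->arg;
+}
+
+ptrdiff_t __attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_undefweak (struct tlsdesc *tdp)
+{
+  return (ptrdiff_t) tdp->arg - (ptrdiff_t) __thread_pointer;
+}
+	*/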
+
+#ifdef SHARED
+ .hidden _dl_tlsdesc_dynamic
+ .global _dl_tlsdesc_dynamic
+ .type _dl_tlsdesc_dynamic,@function
+
+ /* This function is used for symbols that need dynamic TLS.
+
+ %eax points to the TLS descriptor, such that 0(%eax) points to
+ _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+ tlsdesc_dynamic_arg object. It must return in %eax the offset
+ between the thread pointer and the object denoted by the
+ argument, without clobbering any registers.
+
+ The assembly code that follows is a rendition of the following
+ C code, hand-optimized a little bit.
+
+ptrdiff_t
+__attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+{
+ struct tlsdesc_dynamic_arg *td = tdp->arg;
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+ if (__builtin_expect (td->gen_count <= dtv[0].counter
+ && (dtv[td->tlsinfo.ti_module].pointer.val
+ != TLS_DTV_UNALLOCATED),
+ 1))
+ return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+ - __thread_pointer;
+
+ return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+}
+*/
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_dynamic:
+ /* Like all TLS resolvers, preserve call-clobbered registers.
+ We need two scratch regs anyway. */
+ subl $28, %esp
+ cfi_adjust_cfa_offset (28)
+ movl %ecx, 20(%esp)
+ movl %edx, 24(%esp)
+ movl TLSDESC_ARG(%eax), %eax
+ movl %gs:DTV_OFFSET, %edx
+ movl TLSDESC_GEN_COUNT(%eax), %ecx
+ cmpl (%edx), %ecx
+ ja .Lslow
+ movl TLSDESC_MODID(%eax), %ecx
+ movl (%edx,%ecx,8), %edx
+ cmpl $-1, %edx
+ je .Lslow
+ movl TLSDESC_MODOFF(%eax), %eax
+ addl %edx, %eax
+.Lret:
+ movl 20(%esp), %ecx
+ subl %gs:0, %eax
+ movl 24(%esp), %edx
+ addl $28, %esp
+ cfi_adjust_cfa_offset (-28)
+ ret
+ .p2align 4,,7
+.Lslow:
+ cfi_adjust_cfa_offset (28)
+ call HIDDEN_JUMPTARGET (___tls_get_addr)
+ jmp .Lret
+ cfi_endproc
+ .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+#endif /* SHARED */
+
+ /* This function is a wrapper for a lazy resolver for TLS_DESC
+ REL relocations that reference the *ABS* segment in their own
+ link maps. %ebx points to the caller's GOT. %eax points to a
+ TLS descriptor, such that 0(%eax) holds the address of the
+ resolver wrapper itself (unless some other thread beat us to
+ it) and 4(%eax) holds the addend in the relocation.
+
+ When the actual resolver returns, it will have adjusted the
+ TLS descriptor such that we can tail-call it for it to return
+ the TP offset of the symbol. */
+
+ .hidden _dl_tlsdesc_resolve_abs_plus_addend
+ .global _dl_tlsdesc_resolve_abs_plus_addend
+ .type _dl_tlsdesc_resolve_abs_plus_addend,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_resolve_abs_plus_addend:
+0:
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
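+ /* %ecx gets the displacement from the wrapper's entry point
+ (0:) to the return address (1:); %edx gets the caller's link
+ map, which the dynamic linker keeps in the second GOT entry. */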
+ movl $1f - 0b, %ecx
+ movl 4(%ebx), %edx
+ call _dl_tlsdesc_resolve_abs_plus_addend_fixup
+1:
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ popl %ecx
+ cfi_adjust_cfa_offset (-4)
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ jmp *(%eax)
+ cfi_endproc
+ .size _dl_tlsdesc_resolve_abs_plus_addend, .-_dl_tlsdesc_resolve_abs_plus_addend
+
+ /* This function is a wrapper for a lazy resolver for TLS_DESC
+ REL relocations that had zero addends. %ebx points to the
+ caller's GOT. %eax points to a TLS descriptor, such that
+ 0(%eax) holds the address of the resolver wrapper itself
+ (unless some other thread beat us to it) and 4(%eax) holds a
+ pointer to the relocation.
+
+ When the actual resolver returns, it will have adjusted the
+ TLS descriptor such that we can tail-call it for it to return
+ the TP offset of the symbol. */
+
+ .hidden _dl_tlsdesc_resolve_rel
+ .global _dl_tlsdesc_resolve_rel
+ .type _dl_tlsdesc_resolve_rel,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_resolve_rel:
+0:
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+ movl $1f - 0b, %ecx
+ movl 4(%ebx), %edx
+ call _dl_tlsdesc_resolve_rel_fixup
+1:
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ popl %ecx
+ cfi_adjust_cfa_offset (-4)
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ jmp *(%eax)
+ cfi_endproc
+ .size _dl_tlsdesc_resolve_rel, .-_dl_tlsdesc_resolve_rel
+
+ /* This function is a wrapper for a lazy resolver for TLS_DESC
+ RELA relocations. %ebx points to the caller's GOT. %eax
+ points to a TLS descriptor, such that 0(%eax) holds the
+ address of the resolver wrapper itself (unless some other
+ thread beat us to it) and 4(%eax) holds a pointer to the
+ relocation.
+
+ When the actual resolver returns, it will have adjusted the
+ TLS descriptor such that we can tail-call it for it to return
+ the TP offset of the symbol. */
+
+ .hidden _dl_tlsdesc_resolve_rela
+ .global _dl_tlsdesc_resolve_rela
+ .type _dl_tlsdesc_resolve_rela,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_resolve_rela:
+0:
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+ movl $1f - 0b, %ecx
+ movl 4(%ebx), %edx
+ call _dl_tlsdesc_resolve_rela_fixup
+1:
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ popl %ecx
+ cfi_adjust_cfa_offset (-4)
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ jmp *(%eax)
+ cfi_endproc
+ .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela
+
+ /* This function is a placeholder for lazy resolving of TLS
+ relocations. Once some thread starts resolving a TLS
+ relocation, it sets up the TLS descriptor to use this
+ resolver, such that other threads that would attempt to
+ resolve it concurrently may skip the call to the original lazy
+ resolver and go straight to a condition wait.
+
+ When the actual resolver returns, it will have adjusted the
+ TLS descriptor such that we can tail-call it for it to return
+ the TP offset of the symbol. */
+
+ .hidden _dl_tlsdesc_resolve_hold
+ .global _dl_tlsdesc_resolve_hold
+ .type _dl_tlsdesc_resolve_hold,@function
+ cfi_startproc
+ .align 16
+_dl_tlsdesc_resolve_hold:
+0:
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+ movl $1f - 0b, %ecx
+ movl 4(%ebx), %edx
+ call _dl_tlsdesc_resolve_hold_fixup
+1:
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ popl %ecx
+ cfi_adjust_cfa_offset (-4)
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ jmp *(%eax)
+ cfi_endproc
+ .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.h b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
new file mode 100644
index 0000000000..242bebfc8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h
@@ -0,0 +1,61 @@
+/* Thread-local storage descriptor handling in the ELF dynamic linker.
+ i386 version.
+ Copyright (C) 2005-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _I386_DL_TLSDESC_H
+# define _I386_DL_TLSDESC_H 1
+
+/* Type used to represent a TLS descriptor in the GOT. */
+struct tlsdesc
+{
+ ptrdiff_t __attribute__ ((regparm (1))) (*entry) (struct tlsdesc *);
+ void *arg;
+};
+
+typedef struct dl_tls_index
+{
+ unsigned long int ti_module;
+ unsigned long int ti_offset;
+} tls_index;
+
+/* Type used as the argument in a TLS descriptor for a symbol that
+ needs dynamic TLS offsets. */
+struct tlsdesc_dynamic_arg
+{
+ tls_index tlsinfo;
+ size_t gen_count;
+};
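+
+/* Illustrative sketch (not part of this interface): code that
+   accesses a TLS variable through a descriptor TD from the GOT
+   effectively computes
+
+     void *addr = (char *) tp + td->entry (td);
+
+   where TP is the thread pointer (the word at %gs:0) and the call
+   passes TD in %eax per regparm (1).  */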
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+ _dl_tlsdesc_return (struct tlsdesc *),
+ _dl_tlsdesc_undefweak (struct tlsdesc *),
+ _dl_tlsdesc_resolve_abs_plus_addend (struct tlsdesc *),
+ _dl_tlsdesc_resolve_rel (struct tlsdesc *),
+ _dl_tlsdesc_resolve_rela (struct tlsdesc *),
+ _dl_tlsdesc_resolve_hold (struct tlsdesc *);
+
+# ifdef SHARED
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
+ size_t ti_offset)
+ internal_function attribute_hidden;
+
+extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1)))
+ _dl_tlsdesc_dynamic (struct tlsdesc *);
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/dl-trampoline.S b/REORG.TODO/sysdeps/i386/dl-trampoline.S
new file mode 100644
index 0000000000..6e7f3aef92
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/dl-trampoline.S
@@ -0,0 +1,215 @@
+/* PLT trampolines. i386 version.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <link-defines.h>
+
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
+ .text
+ .globl _dl_runtime_resolve
+ .type _dl_runtime_resolve, @function
+ cfi_startproc
+ .align 16
+_dl_runtime_resolve:
+ cfi_adjust_cfa_offset (8)
+ pushl %eax # Preserve registers otherwise clobbered.
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+ movl 16(%esp), %edx # Copy args pushed by PLT in register. Note
+ movl 12(%esp), %eax # that `fixup' takes its parameters in regs.
+ call _dl_fixup # Call resolver.
+ popl %edx # Get register content back.
+ cfi_adjust_cfa_offset (-4)
+ movl (%esp), %ecx
+ movl %eax, (%esp) # Store the function address.
+ movl 4(%esp), %eax
+ ret $12 # Jump to function address.
+ cfi_endproc
+ .size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+ .globl _dl_runtime_profile
+ .type _dl_runtime_profile, @function
+ cfi_startproc
+ .align 16
+_dl_runtime_profile:
+ cfi_adjust_cfa_offset (8)
+ pushl %esp
+ cfi_adjust_cfa_offset (4)
+ addl $8, (%esp) # Account for the pushed PLT data
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %eax # Preserve registers otherwise clobbered.
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+ movl %esp, %ecx
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ movl $-1, 4(%esp)
+ leal 4(%esp), %edx
+ movl %edx, (%esp)
+ pushl %ecx # Address of the register structure
+ cfi_adjust_cfa_offset (4)
+ movl 40(%esp), %ecx # Load return address
+ movl 36(%esp), %edx # Copy args pushed by PLT in register. Note
+ movl 32(%esp), %eax # that `fixup' takes its parameters in regs.
+ call _dl_profile_fixup # Call resolver.
+ cfi_adjust_cfa_offset (-8)
+ movl (%esp), %edx
+ testl %edx, %edx
+ jns 1f
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ popl %edx # Get register content back.
+ cfi_adjust_cfa_offset (-4)
+ movl (%esp), %ecx
+ movl %eax, (%esp) # Store the function address.
+ movl 4(%esp), %eax
+ ret $20 # Jump to function address.
+
+ /*
+ +32 return address
+ +28 PLT1
+ +24 PLT2
+ +20 %esp
+ +16 %ebp
+ +12 %eax
+ +8 %ecx
+ +4 %edx
+ %esp free
+ */
+ cfi_adjust_cfa_offset (8)
+1: movl %ebx, (%esp)
+ cfi_rel_offset (ebx, 0)
+ movl %edx, %ebx # This is the frame buffer size
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (esi, 0)
+ leal 44(%esp), %esi
+ movl %ebx, %ecx
+ orl $4, %ebx # Increase frame size if necessary to align
+ # stack for the function call
+ andl $~3, %ebx
+ movl %esp, %edi
+ subl %ebx, %edi
+ movl %esp, %ebx
+ cfi_def_cfa_register (ebx)
+ movl %edi, %esp
+ shrl $2, %ecx
+ rep
+ movsl
+ movl (%ebx), %esi
+ cfi_restore (esi)
+ movl 4(%ebx), %edi
+ cfi_restore (edi)
+ /*
+ %ebx+40 return address
+ %ebx+36 PLT1
+ %ebx+32 PLT2
+ %ebx+28 %esp
+ %ebx+24 %ebp
+ %ebx+20 %eax
+ %ebx+16 %ecx
+ %ebx+12 %edx
+ %ebx+8 %ebx
+ %ebx+4 free
+ %ebx free
+ %esp copied stack frame
+ */
+ movl %eax, (%ebx)
+ movl 12(%ebx), %edx
+ movl 16(%ebx), %ecx
+ movl 20(%ebx), %eax
+ call *(%ebx)
+ movl %ebx, %esp
+ cfi_def_cfa_register (esp)
+ movl 8(%esp), %ebx
+ cfi_restore (ebx)
+ /*
+ +40 return address
+ +36 PLT1
+ +32 PLT2
+ +28 %esp
+ +24 %ebp
+ +20 %eax
+ +16 %ecx
+ +12 %edx
+ +8 free
+ +4 free
+ %esp free
+ */
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+ # Allocate space for La_i86_retval, minus the 12 bytes already free.
+ subl $(LRV_SIZE - 12), %esp
+ cfi_adjust_cfa_offset (LRV_SIZE - 12)
+ movl %eax, LRV_EAX_OFFSET(%esp)
+ movl %edx, LRV_EDX_OFFSET(%esp)
+ fstpt LRV_ST0_OFFSET(%esp)
+ fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+ bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+ bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+ .byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET
+ .byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
+ pushl %esp
+ cfi_adjust_cfa_offset (4)
+ # Address of La_i86_regs area.
+ leal (LRV_SIZE + 4)(%esp), %ecx
+ # PLT2
+ movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax
+ # PLT1
+ movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx
+ call _dl_call_pltexit
+ movl LRV_EAX_OFFSET(%esp), %eax
+ movl LRV_EDX_OFFSET(%esp), %edx
+ fldt LRV_ST1_OFFSET(%esp)
+ fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+ bndmov LRV_BND0_OFFSET(%esp), %bnd0
+ bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+ .byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET
+ .byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
+ # Restore stack before return.
+ addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
+ cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+ PRESERVE_BND_REGS_PREFIX
+ ret
+ cfi_endproc
+ .size _dl_runtime_profile, .-_dl_runtime_profile
+#endif
diff --git a/REORG.TODO/sysdeps/i386/ffs.c b/REORG.TODO/sysdeps/i386/ffs.c
new file mode 100644
index 0000000000..c229c8166e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ffs.c
@@ -0,0 +1,50 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+ For Intel 80x86, x>=3.
+ This file is part of the GNU C Library.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ Contributed by Torbjorn Granlund (tege@sics.se).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef ffs
+
+#ifdef __GNUC__
+
+int
+__ffs (int x)
+{
+ int cnt;
+ int tmp;
+
+ asm ("xorl %0,%0\n" /* Set CNT to zero. */
+ "bsfl %2,%1\n" /* Count low bits in X and store in %1. */
+ "jz 1f\n" /* Jump if OK, i.e. X was non-zero. */
+ "leal 1(%1),%0\n" /* Return bsfl-result plus one on %0. */
+ "1:" : "=&a" (cnt), "=r" (tmp) : "rm" (x));
+
+ return cnt;
+}
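+
+/* For example: __ffs (0) == 0, __ffs (1) == 1 and __ffs (0x18) == 4,
+   since bit 3 (value 8) is the lowest bit set in 0x18.  */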
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/Implies b/REORG.TODO/sysdeps/i386/fpu/Implies
new file mode 100644
index 0000000000..2b745a34fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Implies
@@ -0,0 +1 @@
+x86/fpu
diff --git a/REORG.TODO/sysdeps/i386/fpu/Versions b/REORG.TODO/sysdeps/i386/fpu/Versions
new file mode 100644
index 0000000000..a2eec371f1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/Versions
@@ -0,0 +1,6 @@
+libm {
+ GLIBC_2.2 {
+ # functions used in inline functions or macros
+ __expl; __expm1l;
+ }
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/doasin.c b/REORG.TODO/sysdeps/i386/fpu/doasin.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/doasin.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acos.S b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
new file mode 100644
index 0000000000..586c7fc406
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acos.S
@@ -0,0 +1,25 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: e_acos.S,v 1.4 1995/05/08 23:44:37 jtc Exp $")
+
+/* acos = atan (sqrt((1-x) (1+x)) / x) */
+ENTRY(__ieee754_acos)
+ fldl 4(%esp) /* x */
+ fld %st /* x : x */
+ fld1 /* 1 : x : x */
+ fsubp /* 1 - x : x */
+ fld1 /* 1 : 1 - x : x */
+ fadd %st(2) /* 1 + x : 1 - x : x */
+ fmulp /* 1 - x^2 : x */
+ fsqrt /* sqrt (1 - x^2) : x */
+ fabs
+ fxch %st(1) /* x : sqrt (1 - x^2) */
+ fpatan /* atan (sqrt(1 - x^2) / x) */
+ ret
+END (__ieee754_acos)
+strong_alias (__ieee754_acos, __acos_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosf.S b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
new file mode 100644
index 0000000000..54930af8b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+/* acos = atan (sqrt(1 - x^2) / x) */
+ENTRY(__ieee754_acosf)
+ flds 4(%esp) /* x */
+ fld %st
+ fmul %st(0) /* x^2 */
+ fld1
+ fsubp /* 1 - x^2 */
+ fsqrt /* sqrt (1 - x^2) */
+ fabs
+ fxch %st(1)
+ fpatan
+ ret
+END (__ieee754_acosf)
+strong_alias (__ieee754_acosf, __acosf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosh.S b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
new file mode 100644
index 0000000000..9555ef8078
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
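+
+/* Sketch of the case analysis below, with t = x - 1 (illustrative):
+     1 <= x <= 2:   acosh(x) = log1p (t + sqrt (2*t + t*t))
+     2 < x <= 2^28: acosh(x) = log (2*x - 1/(x + sqrt (x*x - 1)))
+     x > 2^28:      acosh(x) = log (x) + log (2)
+     x < 1:         NaN (invalid).  */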
+
+ .text
+ENTRY(__ieee754_acosh)
+ movl 8(%esp), %ecx
+ cmpl $0x3ff00000, %ecx
+ jl 5f // < 1 => invalid
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ cmpl $0x41b00000, %ecx
+ ja 3f // x > 2^28
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x40000000, %ecx
+ ja 4f // x > 2
+
+ // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ fsubl MO(one) // x-1 : log(2)
+ fabs // acosh(1) is +0 in all rounding modes
+ fld %st // x-1 : x-1 : log(2)
+ fmul %st(1) // (x-1)^2 : x-1 : log(2)
+ fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2)
+ fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2)
+ fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+ faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 2f
+ fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+ // x > 2^28 => y = log(x) + log(2)
+ .align ALIGNARG(4)
+3: fyl2x // log(x)
+ fldln2 // log(2) : log(x)
+ faddp // log(x)+log(2)
+ ret
+
+ // 2^28 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+ .align ALIGNARG(4)
+4: fld %st // x : x : log(2)
+ fadd %st, %st(1) // x : 2*x : log(2)
+ fld %st // x : x : 2*x : log(2)
+ fmul %st(1) // x^2 : x : 2*x : log(2)
+ fsubl MO(one) // x^2-1 : x : 2*x : log(2)
+ fsqrt // sqrt(x^2-1) : x : 2*x : log(2)
+ faddp // x+sqrt(x^2-1) : 2*x : log(2)
+ fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+ fsubrp // 2*x-1/(x+sqrt(x^2-1)) : log(2)
+ fyl2x // log(2*x-1/(x+sqrt(x^2-1)))
+ ret
+
+ // x < 1 (or -NaN) => NaN
+ .align ALIGNARG(4)
+5: fldl 4(%esp)
+ fsub %st
+ fdiv %st, %st(0)
+ ret
+END(__ieee754_acosh)
+strong_alias (__ieee754_acosh, __acosh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
new file mode 100644
index 0000000000..662fda3c06
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S
@@ -0,0 +1,101 @@
+/* ix87 specific implementation of arccosh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_acoshf)
+ movl 4(%esp), %ecx
+ cmpl $0x3f800000, %ecx
+ jl 5f // < 1 => invalid
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ cmpl $0x47000000, %ecx
+ ja 3f // x > 2^15
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x40000000, %ecx
+ ja 4f // x > 2
+
+ // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ fsubl MO(one) // x-1 : log(2)
+ fabs // acosh(1) is +0 in all rounding modes
+ fld %st // x-1 : x-1 : log(2)
+ fmul %st(1) // (x-1)^2 : x-1 : log(2)
+ fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2)
+ fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2)
+ fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+ faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 2f
+ fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+ // x > 2^15 => y = log(x) + log(2)
+ .align ALIGNARG(4)
+3: fyl2x // log(x)
+ fldln2 // log(2) : log(x)
+ faddp // log(x)+log(2)
+ ret
+
+ // 2^15 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+ .align ALIGNARG(4)
+4: fld %st // x : x : log(2)
+ fadd %st, %st(1) // x : 2*x : log(2)
+ fld %st // x : x : 2*x : log(2)
+ fmul %st(1) // x^2 : x : 2*x : log(2)
+ fsubl MO(one) // x^2-1 : x : 2*x : log(2)
+ fsqrt // sqrt(x^2-1) : x : 2*x : log(2)
+ faddp // x+sqrt(x^2-1) : 2*x : log(2)
+ fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+ fsubrp // 2*x-1/(x+sqrt(x^2-1)) : log(2)
+ fyl2x // log(2*x-1/(x+sqrt(x^2-1)))
+ ret
+
+ // x < 1 (or -NaN) => NaN
+ .align ALIGNARG(4)
+5: flds 4(%esp)
+ fsub %st
+ fdiv %st, %st(0)
+ ret
+END(__ieee754_acoshf)
+strong_alias (__ieee754_acoshf, __acoshf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
new file mode 100644
index 0000000000..e0d6466aac
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S
@@ -0,0 +1,107 @@
+/* ix87 specific implementation of arccosh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ /* Please note that we use a double value for 1.0. This number
+ has an exact representation and so we don't get accuracy
+ problems. The advantage is that the code is simpler. */
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_acoshl)
+ movl 12(%esp), %ecx
+ andl $0xffff, %ecx
+ cmpl $0x3fff, %ecx
+ jl 5f // < 1 => invalid
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+ cmpl $0x4020, %ecx
+ ja 3f // x > 2^34
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x4000, %ecx
+ ja 4f // x > 2
+
+ // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ fsubl MO(one) // x-1 : log(2)
+ fabs // acosh(1) is +0 in all rounding modes
+ fld %st // x-1 : x-1 : log(2)
+ fmul %st(1) // (x-1)^2 : x-1 : log(2)
+ fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2)
+ fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2)
+ fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2)
+ faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 2f
+ fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2)
+ fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2))
+ ret
+
+ // x > 2^34 => y = log(x) + log(2)
+ .align ALIGNARG(4)
+3: fyl2x // log(x)
+ fldln2 // log(2) : log(x)
+ faddp // log(x)+log(2)
+ ret
+
+ // 2^34 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1)))
+ .align ALIGNARG(4)
+4: fld %st // x : x : log(2)
+ fadd %st, %st(1) // x : 2*x : log(2)
+ fld %st // x : x : 2*x : log(2)
+ fmul %st(1) // x^2 : x : 2*x : log(2)
+ fsubl MO(one) // x^2-1 : x : 2*x : log(2)
+ fsqrt // sqrt(x^2-1) : x : 2*x : log(2)
+ faddp // x+sqrt(x^2-1) : 2*x : log(2)
+ fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2)
+ fsubrp // 2*x-1/(x+sqrt(x^2-1)) : log(2)
+ fyl2x // log(2*x-1/(x+sqrt(x^2-1)))
+ ret
+
+ // x < 1 => NaN
+ .align ALIGNARG(4)
+5: fldz
+ fdiv %st, %st(0)
+ ret
+END(__ieee754_acoshl)
+strong_alias (__ieee754_acoshl, __acoshl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosl.c b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
new file mode 100644
index 0000000000..ab08931924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_acosl (long double x)
+{
+ long double res;
+
+ /* acosl = atanl (sqrtl((1-x) (1+x)) / x) */
+ asm ( "fld %%st\n"
+ "fld1\n"
+ "fsubp\n"
+ "fld1\n"
+ "fadd %%st(2)\n"
+ "fmulp\n" /* 1 - x^2 */
+ "fsqrt\n" /* sqrtl (1 - x^2) */
+ "fabs\n"
+ "fxch %%st(1)\n"
+ "fpatan"
+ : "=t" (res) : "0" (x) : "st(1)");
+ return res;
+}
+strong_alias (__ieee754_acosl, __acosl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asin.S b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
new file mode 100644
index 0000000000..39c8b47da4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asin.S
@@ -0,0 +1,38 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_asin.S,v 1.4 1995/05/08 23:45:40 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+
+/* asin = atan (x / sqrt((1-x) (1+x))) */
+ENTRY(__ieee754_asin)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp) /* x */
+ fld %st
+ fld1 /* 1 : x : x */
+ fsubp /* 1 - x : x */
+ fld1 /* 1 : 1 - x : x */
+ fadd %st(2) /* 1 + x : 1 - x : x */
+ fmulp /* 1 - x^2 */
+ fsqrt /* sqrt (1 - x^2) */
+ fpatan
+ DBL_CHECK_FORCE_UFLOW
+ ret
+END (__ieee754_asin)
+strong_alias (__ieee754_asin, __asin_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asinf.S b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
new file mode 100644
index 0000000000..1102bdedfd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S
@@ -0,0 +1,39 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: $")
+
+ .section .rodata.cst4,"aM",@progbits,4
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+
+/* asin = atan (x / sqrt(1 - x^2)) */
+ENTRY(__ieee754_asinf)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp) /* x */
+ fld %st
+ fmul %st(0) /* x^2 */
+ fld1
+ fsubp /* 1 - x^2 */
+ fsqrt /* sqrt (1 - x^2) */
+ fpatan
+ FLT_CHECK_FORCE_UFLOW
+ ret
+END (__ieee754_asinf)
+strong_alias (__ieee754_asinf, __asinf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
new file mode 100644
index 0000000000..25f43bb5a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2.S,v 1.4 1995/05/08 23:46:28 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_atan2)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+ fldl 12(%esp)
+ fpatan
+ DBL_CHECK_FORCE_UFLOW_NARROW
+ ret
+END (__ieee754_atan2)
+strong_alias (__ieee754_atan2, __atan2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
new file mode 100644
index 0000000000..2bc909a762
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: e_atan2f.S,v 1.1 1995/05/08 23:35:10 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_atan2f)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+ flds 8(%esp)
+ fpatan
+ FLT_CHECK_FORCE_UFLOW_NARROW
+ ret
+END (__ieee754_atan2f)
+strong_alias (__ieee754_atan2f, __atan2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
new file mode 100644
index 0000000000..9f88bfcc08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_atan2l (long double y, long double x)
+{
+ long double res;
+
+ asm ("fpatan" : "=t" (res) : "u" (y), "0" (x) : "st(1)");
+
+ return res;
+}
+strong_alias (__ieee754_atan2l, __atan2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanh.S b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
new file mode 100644
index 0000000000..cbc93d5da2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S
@@ -0,0 +1,112 @@
+/* ix87 specific implementation of arctanh function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type half,@object
+half: .double 0.5
+ ASM_SIZE_DIRECTIVE(half)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ .type ln2_2,@object
+ln2_2: .tfloat 0.3465735902799726547086160
+ ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
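+
+/* Sketch (illustrative): for |x| < 1,
+     atanh(x) = sign(x) * 0.5*ln2 * log2 ((1+|x|)/(1-|x|)),
+   and for small |x| the argument of fyl2xp1 is formed as
+     2*|x| + 2*|x|^2/(1-|x|) == (1+|x|)/(1-|x|) - 1.  */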
+
+ .text
+ENTRY(__ieee754_atanh)
+ movl 8(%esp), %ecx
+
+ movl %ecx, %eax
+ andl $0x7fffffff, %eax
+ cmpl $0x7ff00000, %eax
+ jae 5f
+7:
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ andl $0x80000000, %ecx // ECX == 0 iff X >= 0
+
+ fldt MO(ln2_2) // 0.5*ln2
+ xorl %ecx, 8(%esp)
+ fldl 4(%esp) // |x| : 0.5*ln2
+ fcoml MO(half) // |x| : 0.5*ln2
+ fld %st // |x| : |x| : 0.5*ln2
+ fnstsw // |x| : |x| : 0.5*ln2
+ sahf
+ jae 2f
+ fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2
+ fld %st // |x| : |x| : 2*|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2
+ fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2
+ fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+ fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+ faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ sahf
+ jae 4f
+ fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ DBL_CHECK_FORCE_UFLOW_NONNEG
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2
+ fxch // |x| : 1+|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2
+ fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld((1+x)/(1-x))
+3: ret
+
+ // x == NaN or ±Inf
+5: ja 6f
+ cmpl $0, 4(%esp)
+ je 7b
+6: fldl 4(%esp)
+ ret
+END(__ieee754_atanh)
+strong_alias (__ieee754_atanh, __atanh_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
new file mode 100644
index 0000000000..92fda3fd82
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S
@@ -0,0 +1,109 @@
+/* ix87 specific implementation of arctanh function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type half,@object
+half: .double 0.5
+ ASM_SIZE_DIRECTIVE(half)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ .align ALIGNARG(4)
+ .type ln2_2,@object
+ln2_2: .tfloat 0.3465735902799726547086160
+ ASM_SIZE_DIRECTIVE(ln2_2)
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_atanhf)
+ movl 4(%esp), %ecx
+
+ movl %ecx, %eax
+ andl $0x7fffffff, %eax
+ cmpl $0x7f800000, %eax
+ ja 5f
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ andl $0x80000000, %ecx // ECX == 0 iff X >= 0
+
+ fldt MO(ln2_2) // 0.5*ln2
+ xorl %ecx, 4(%esp)
+ flds 4(%esp) // |x| : 0.5*ln2
+ fcoml MO(half) // |x| : 0.5*ln2
+ fld %st(0) // |x| : |x| : 0.5*ln2
+ fnstsw // |x| : |x| : 0.5*ln2
+ sahf
+ jae 2f
+ fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2
+ fld %st // |x| : |x| : 2*|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2
+ fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2
+ fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+ fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+ faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ sahf
+ jae 4f
+ fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ FLT_CHECK_FORCE_UFLOW_NONNEG
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2
+ fxch // |x| : 1+|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2
+ fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld((1+x)/(1-x))
+3: ret
+
+ // x == NaN
+5: flds 4(%esp)
+ ret
+END(__ieee754_atanhf)
+strong_alias (__ieee754_atanhf, __atanhf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
new file mode 100644
index 0000000000..31ff7e5182
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S
@@ -0,0 +1,127 @@
+/* ix87 specific implementation of arctanh function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ /* Please note that we use double values for 0.5 and 1.0. These
+ numbers have exact representations and so we don't get accuracy
+ problems. The advantage is that the code is simpler. */
+ .type half,@object
+half: .double 0.5
+ ASM_SIZE_DIRECTIVE(half)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ .align ALIGNARG(4)
+ .type ln2_2,@object
+ln2_2: .tfloat 0.3465735902799726547086160
+ ASM_SIZE_DIRECTIVE(ln2_2)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_atanhl)
+ movl 12(%esp), %ecx
+
+ movl %ecx, %eax
+ andl $0x7fff, %eax
+ cmpl $0x7fff, %eax
+ je 5f
+ cmpl $0x3fdf, %eax
+ jge 7f
+ // Exponent below -32; return x, with underflow if subnormal.
+ fldt 4(%esp)
+ cmpl $0, %eax
+ jne 8f
+ fld %st(0)
+ fmul %st(0)
+ fstp %st(0)
+8: ret
+7:
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ andl $0x8000, %ecx // ECX == 0 iff X >= 0
+
+ fldt MO(ln2_2) // 0.5*ln2
+ xorl %ecx, 12(%esp)
+ fldt 4(%esp) // |x| : 0.5*ln2
+ fcoml MO(half) // |x| : 0.5*ln2
+ fld %st(0) // |x| : |x| : 0.5*ln2
+ fnstsw // |x| : |x| : 0.5*ln2
+ sahf
+ jae 2f
+ fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2
+ fld %st // |x| : |x| : 2*|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2
+ fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2
+ fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2
+ fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2
+ faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ sahf
+ jae 4f
+ fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x))
+3: ret
+
+ .align ALIGNARG(4)
+2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2
+ fxch // |x| : 1+|x| : 0.5*ln2
+ fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2
+ fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2
+ fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|))
+ jecxz 3f
+ fchs // 0.5*ln2*ld((1+x)/(1-x))
+3: ret
+
+ // x == NaN or ±Inf
+5: cmpl $0x80000000, 8(%esp)
+ ja 6f
+ cmpl $0, 4(%esp)
+ je 7b
+6: fldt 4(%esp)
+ fadd %st(0)
+ ret
+END(__ieee754_atanhl)
+strong_alias (__ieee754_atanhl, __atanhl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp.S b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
new file mode 100644
index 0000000000..a7e7f13f6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp.S
@@ -0,0 +1,73 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* e^x = 2^(x * log2(e)) */
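+/* Split z = x * log2(e) into i = rint (z) and f = z - i; then
+   e^x = 2^i * (f2xm1 (f) + 1), with the 2^i applied via fscale.  */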
+ENTRY(__ieee754_exp)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2e
+ fmulp /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl2e
+ fmull 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG
+ ret
+END(__exp_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
new file mode 100644
index 0000000000..acb5160a3f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2t
+ fmulp /* x * log2(10) */
+ fld %st
+ frndint /* int(x * log2(10)) */
+ fsubr %st,%st(1) /* fract(x * log2(10)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(10))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(10))) */
+ fscale /* 10^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
new file mode 100644
index 0000000000..1812b34398
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S
@@ -0,0 +1,53 @@
+/*
+ * Written by Ulrich Drepper.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10f)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2t
+ fmulp /* x * log2(10) */
+ fld %st
+ frndint /* int(x * log2(10)) */
+ fsubr %st,%st(1) /* fract(x * log2(10)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(10))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(10))) */
+ fscale /* 10^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
new file mode 100644
index 0000000000..d843e2b5e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXP10L
+#include <e_expl.S>
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
new file mode 100644
index 0000000000..fc16a96053
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
new file mode 100644
index 0000000000..30623cd850
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S
@@ -0,0 +1,52 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2f)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
new file mode 100644
index 0000000000..c4cb73d589
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S
@@ -0,0 +1,60 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2l)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldt 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ movzwl 4+8(%esp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x3fbe, %eax
+ jge 3f
+ /* Argument's exponent below -65, result rounds to 1. */
+ fld1
+ faddp
+ ret
+3: fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expf.S b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
new file mode 100644
index 0000000000..65cb4ec204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expf.S
@@ -0,0 +1,74 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* e^x = 2^(x * log2(e)) */
+ENTRY(__ieee754_expf)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2e
+ fmulp /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl2e
+ fmuls 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG
+ ret
+END(__expf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expl.S b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
new file mode 100644
index 0000000000..7d75fe22a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_expl.S
@@ -0,0 +1,226 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+/*
+ * The 8087 method for the exponential function is to calculate
+ * exp(x) = 2^(x log2(e))
+ * after separating integer and fractional parts
+ * x log2(e) = i + f, |f| <= .5
+ * 2^i is immediate but f needs to be precise for long double accuracy.
+ * Suppress range reduction error in computing f by the following.
+ * Separate x into integer and fractional parts
+ * x = xi + xf, |xf| <= .5
+ * Separate log2(e) into the sum of an exact number c0 and small part c1.
+ * c0 + c1 = log2(e) to extra precision
+ * Then
+ * f = (c0 xi - i) + c0 xf + c1 x
+ * where c0 xi is exact and so also is (c0 xi - i).
+ * -- moshier@na-net.ornl.gov
+ */
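+
+/*
+ * As a C sketch (illustrative; c0 and c1 are the constants defined
+ * below, with c0 + c1 = log2(base) to extra precision):
+ *	i  = rint (x * log2(base));
+ *	xi = rint (x);
+ *	xf = x - xi;
+ *	f  = (c0 * xi - i) + c0 * xf + c1 * x;
+ *	base^x = 2^i * (f2xm1 (f) + 1);
+ */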
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+#ifdef USE_AS_EXP10L
+# define IEEE754_EXPL __ieee754_exp10l
+# define EXPL_FINITE __exp10l_finite
+# define FLDLOG fldl2t
+#elif defined USE_AS_EXPM1L
+# define IEEE754_EXPL __expm1l
+# undef EXPL_FINITE
+# define FLDLOG fldl2e
+#else
+# define IEEE754_EXPL __ieee754_expl
+# define EXPL_FINITE __expl_finite
+# define FLDLOG fldl2e
+#endif
+
+ .section .rodata.cst16,"aM",@progbits,16
+
+ .p2align 4
+#ifdef USE_AS_EXP10L
+ .type c0,@object
+c0: .byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40
+ .byte 0, 0, 0, 0, 0, 0
+ ASM_SIZE_DIRECTIVE(c0)
+ .type c1,@object
+c1: .byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f
+ .byte 0, 0, 0, 0, 0, 0
+ ASM_SIZE_DIRECTIVE(c1)
+#else
+ .type c0,@object
+c0: .byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f
+ .byte 0, 0, 0, 0, 0, 0
+ ASM_SIZE_DIRECTIVE(c0)
+ .type c1,@object
+c1: .byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f
+ .byte 0, 0, 0, 0, 0, 0
+ ASM_SIZE_DIRECTIVE(c1)
+#endif
+#ifndef USE_AS_EXPM1L
+ .type csat,@object
+csat: .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40
+ .byte 0, 0, 0, 0, 0, 0
+ ASM_SIZE_DIRECTIVE(csat)
+DEFINE_LDBL_MIN
+#endif
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+ movzwl 4+8(%esp), %eax
+ xorb $0x80, %ah // invert sign bit (now 1 is "positive")
+ cmpl $0xc006, %eax // is num positive and exp >= 6 (number is >= 128.0)?
+ jae HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0)
+#endif
+ fldt 4(%esp)
+/* I added the following ugly construct because expl(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+#ifdef USE_AS_EXPM1L
+ xorb $0x80, %ah
+ cmpl $0xc006, %eax
+ fstsw %ax
+ movb $0x45, %dh
+ jb 4f
+
+ /* Below -64.0 (may be -NaN or -Inf). */
+ andb %ah, %dh
+ cmpb $0x01, %dh
+ je 6f /* Is +-NaN, jump. */
+ jmp 1f /* -large, possibly -Inf. */
+
+4: /* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf). */
+ /* Test for +-0 as argument. */
+ andb %ah, %dh
+ cmpb $0x40, %dh
+ je 2f
+
+ /* Test for arguments that are small but not subnormal. */
+ movzwl 4+8(%esp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x3fbf, %eax
+ jge 3f
+ /* Argument's exponent below -64; avoid spurious underflow if
+ normal. */
+ cmpl $0x0001, %eax
+ jge 2f
+ /* Force underflow and return the argument, to avoid wrong signs
+ of zero results from the code below in some rounding modes. */
+ fld %st
+ fmul %st
+ fstp %st
+ jmp 2f
+#else
+ movzwl 4+8(%esp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x400d, %eax
+ jg 5f
+ cmpl $0x3fbc, %eax
+ jge 3f
+ /* Argument's exponent below -67, result rounds to 1. */
+ fld1
+ faddp
+ jmp 2f
+5: /* Overflow, underflow or infinity or NaN as argument. */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x01, %dh
+ je 6f /* Is +-NaN, jump. */
+ /* Overflow or underflow; saturate. */
+ fstp %st
+ fldt MO(csat)
+ andb $2, %ah
+ jz 3f
+ fchs
+#endif
+3: FLDLOG /* 1 log2(base) */
+ fmul %st(1), %st /* 1 x log2(base) */
+ /* Set round-to-nearest temporarily. */
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fstcw 4(%esp)
+ movl $0xf3ff, %edx
+ andl 4(%esp), %edx
+ movl %edx, (%esp)
+ fldcw (%esp)
+ frndint /* 1 i */
+ fld %st(1) /* 2 x */
+ frndint /* 2 xi */
+ fldcw 4(%esp)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fld %st(1) /* 3 i */
+ fldt MO(c0) /* 4 c0 */
+ fld %st(2) /* 5 xi */
+ fmul %st(1), %st /* 5 c0 xi */
+ fsubp %st, %st(2) /* 4 f = c0 xi - i */
+ fld %st(4) /* 5 x */
+ fsub %st(3), %st /* 5 xf = x - xi */
+ fmulp %st, %st(1) /* 4 c0 xf */
+ faddp %st, %st(1) /* 3 f = f + c0 xf */
+ fldt MO(c1) /* 4 */
+ fmul %st(4), %st /* 4 c1 * x */
+ faddp %st, %st(1) /* 3 f = f + c1 * x */
+ f2xm1 /* 3 2^(fract(x * log2(base))) - 1 */
+#ifdef USE_AS_EXPM1L
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x - 2^i */
+ fxch /* 2 i */
+ fld1 /* 3 1.0 */
+ fscale /* 3 2^i */
+ fld1 /* 4 1.0 */
+ fsubrp %st, %st(1) /* 3 2^i - 1.0 */
+ fstp %st(1) /* 2 */
+ faddp %st, %st(1) /* 1 base^x - 1.0 */
+#else
+ fld1 /* 4 1.0 */
+ faddp /* 3 2^(fract(x * log2(base))) */
+ fstp %st(1) /* 2 */
+ fscale /* 2 scale factor is st(1); base^x */
+ fstp %st(1) /* 1 */
+ LDBL_CHECK_FORCE_UFLOW_NONNEG
+#endif
+ fstp %st(1) /* 0 */
+ jmp 2f
+1:
+#ifdef USE_AS_EXPM1L
+ /* For expm1l, only negative sign gets here. */
+ fstp %st
+ fld1
+ fchs
+#else
+ testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+#endif
+2: ret
+6: /* NaN argument. */
+ fadd %st
+ ret
+END(IEEE754_EXPL)
+#ifdef USE_AS_EXPM1L
+libm_hidden_def (__expm1l)
+weak_alias (__expm1l, expm1l)
+#else
+strong_alias (IEEE754_EXPL, EXPL_FINITE)
+#endif
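
A note on the algorithm above: IEEE754_EXPL evaluates base^x as 2^i * 2^f, where
t = x*log2(base) is split by frndint (under a temporarily forced round-to-nearest
mode) into an integer part i and a fraction f small enough for f2xm1; the c0/c1
constants together approximate log2(base) in two parts so the reduction t - i
loses no precision. A rough C model of the decomposition for exp itself (a
sketch only, not the glibc code; the name exp_model is made up, M_LOG2E stands
in for the c0/c1 split, and the saturation and underflow handling are omitted):

#include <math.h>

static double exp_model (double x)
{
  double t = x * M_LOG2E;            /* x * log2(e); c0/c1 do this in parts. */
  double i = rint (t);               /* frndint under round-to-nearest. */
  double f = t - i;                  /* |f| <= 0.5, safe for f2xm1. */
  return scalbn (exp2 (f), (int) i); /* f2xm1 (+1), then fscale by i. */
}
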
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmod.S b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
new file mode 100644
index 0000000000..26b3acc392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmod)
+ fldl 12(%esp)
+ fldl 4(%esp)
+1: fprem
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+ ret
+END (__ieee754_fmod)
+strong_alias (__ieee754_fmod, __fmod_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
new file mode 100644
index 0000000000..ece4d98427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_fmodf)
+ flds 8(%esp)
+ flds 4(%esp)
+1: fprem
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+ ret
+END(__ieee754_fmodf)
+strong_alias (__ieee754_fmodf, __fmodf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
new file mode 100644
index 0000000000..49700ae8f6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_fmodl (long double x, long double y)
+{
+ long double res;
+
+ asm ("1:\tfprem\n"
+ "fstsw %%ax\n"
+ "sahf\n"
+ "jp 1b\n"
+ "fstp %%st(1)"
+ : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");
+ return res;
+}
+strong_alias (__ieee754_fmodl, __fmodl_finite)
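
All three fmod variants spin on fprem because one fprem pass is only a partial
reduction: it lowers the exponent gap between the operands by at most 63 and
sets C2 in the status word when another pass is needed. The fstsw/sahf pair
maps C0/C2/C3 into CF/PF/ZF, so "jp 1b" loops exactly while C2 is set. A small
C helper bounding the pass count (hypothetical name, a sketch assuming finite
nonzero operands):

#include <math.h>

static int fprem_pass_bound (double x, double y)
{
  int d = ilogb (x) - ilogb (y);   /* the exponent gap drives the loop. */
  return d <= 0 ? 1 : d / 63 + 1;  /* each fprem pass removes <= 63 of it. */
}
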
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypot.S b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
new file mode 100644
index 0000000000..7403566fd7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S
@@ -0,0 +1,75 @@
+/* Compute the hypotenuse of X and Y.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_hypot)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fldl 4(%esp) // x
+ fxam
+ fnstsw
+ fldl 12(%esp) // y : x
+ movb %ah, %ch
+ fxam
+ fnstsw
+ movb %ah, %al
+ orb %ch, %ah
+ sahf
+ jc 1f
+ fmul %st(0) // y * y : x
+ fxch // x : y * y
+ fmul %st(0) // x * x : y * y
+ faddp // x * x + y * y
+ fsqrt
+ DBL_NARROW_EVAL_UFLOW_NONNEG
+2: ret
+
+ // We have to test whether any of the parameters is Inf.
+ // In this case the result is infinity.
+1: andb $0x45, %al
+ cmpb $5, %al
+ je 3f // jump if y is Inf
+ andb $0x45, %ch
+ cmpb $5, %ch
+ jne 4f // jump if x is not Inf
+ fxch
+3: fstp %st(1)
+ fabs
+ jmp 2b
+
+4: testb $1, %al
+ jnz 5f // y is NaN
+ fxch
+5: fstp %st(1)
+ jmp 2b
+
+END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite)
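
The fast path can square both doubles and add with no scaling because the x87
registers carry extended precision with a 15-bit exponent, so x*x cannot
overflow or go subnormal for double inputs; DBL_NARROW_EVAL_UFLOW_NONNEG then
rounds back to double. The special-case block encodes hypot's IEEE quirk that
an infinite argument dominates even a NaN. A C model of just that logic (a
sketch; the plain C fast path would overflow where the x87 one does not):

#include <math.h>

static double hypot_model (double x, double y)
{
  if (isinf (x) || isinf (y))
    return INFINITY;            /* labels 1/3: Inf wins, even over NaN. */
  if (isnan (x) || isnan (y))
    return x + y;               /* labels 4/5: otherwise propagate NaN. */
  return sqrt (x * x + y * y);  /* safe only in 80-bit temporaries. */
}
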
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
new file mode 100644
index 0000000000..6a2c7052b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S
@@ -0,0 +1,64 @@
+/* Compute the hypotenuse of X and Y.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+ .text
+ENTRY(__ieee754_hypotf)
+ flds 4(%esp) // x
+ fxam
+ fnstsw
+ flds 8(%esp) // y : x
+ movb %ah, %ch
+ fxam
+ fnstsw
+ movb %ah, %al
+ orb %ch, %ah
+ sahf
+ jc 1f
+ fmul %st(0) // y * y : x
+ fxch // x : y * y
+ fmul %st(0) // x * x : y * y
+ faddp // x * x + y * y
+ fsqrt
+ FLT_NARROW_EVAL
+2: ret
+
+ // We have to test whether any of the parameters is Inf.
+ // In this case the result is infinity.
+1: andb $0x45, %al
+ cmpb $5, %al
+ je 3f // jump if y is Inf
+ andb $0x45, %ch
+ cmpb $5, %ch
+ jne 4f // jump if x is not Inf
+ fxch
+3: fstp %st(1)
+ fabs
+ jmp 2b
+
+4: testb $1, %al
+ jnz 5f // y is NaN
+ fxch
+5: fstp %st(1)
+ jmp 2b
+
+END(__ieee754_hypotf)
+strong_alias (__ieee754_hypotf, __hypotf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
new file mode 100644
index 0000000000..29ef2214e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogb.S,v 1.5 1995/10/12 15:53:09 jtc Exp $")
+
+ENTRY(__ieee754_ilogb)
+ fldl 4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+ required to return INT_MAX in ISO C99.
+ -- jakub@redhat.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x40, %dh
+ je 2f /* Is +-0, jump. */
+
+ fxtract
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fstp %st
+
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+
+ ret
+
+1: fstp %st
+ movl $0x7fffffff, %eax
+ ret
+2: fstp %st
+ movl $0x80000000, %eax /* FP_ILOGB0 */
+ ret
+END (__ieee754_ilogb)
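
fxtract splits st(0) into a significand and an unbiased exponent; the code pops
the significand and stores the exponent with fistpl, which is exactly ilogb for
finite nonzero input. The guarded cases return INT_MAX for ±Inf (and for NaN,
since glibc's FP_ILOGBNAN is INT_MAX) and FP_ILOGB0 for ±0. In C terms (a
sketch, hypothetical name):

#include <limits.h>
#include <math.h>

static int ilogb_model (double x)
{
  if (isnan (x) || isinf (x))
    return INT_MAX;        /* label 1: 0x7fffffff. */
  if (x == 0.0)
    return FP_ILOGB0;      /* label 2: 0x80000000 in glibc. */
  return (int) logb (x);   /* fxtract's exponent word. */
}
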
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
new file mode 100644
index 0000000000..d72de6c84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S
@@ -0,0 +1,42 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ilogbf.S,v 1.4 1995/10/22 20:32:43 pk Exp $")
+
+ENTRY(__ieee754_ilogbf)
+ flds 4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+ required to return INT_MAX in ISO C99.
+ -- jakub@redhat.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x40, %dh
+ je 2f /* Is +-0, jump. */
+
+ fxtract
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fstp %st
+
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+
+ ret
+
+1: fstp %st
+ movl $0x7fffffff, %eax
+ ret
+2: fstp %st
+ movl $0x80000000, %eax /* FP_ILOGB0 */
+ ret
+END (__ieee754_ilogbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
new file mode 100644
index 0000000000..60761dfa38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S
@@ -0,0 +1,43 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ieee754_ilogbl)
+ fldt 4(%esp)
+/* I added the following ugly construct because ilogb(+-Inf) is
+ required to return INT_MAX in ISO C99.
+ -- jakub@redhat.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ cmpb $0x40, %dh
+ je 2f /* Is +-0, jump. */
+
+ fxtract
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fstp %st
+
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+
+ ret
+
+1: fstp %st
+ movl $0x7fffffff, %eax
+ ret
+2: fstp %st
+ movl $0x80000000, %eax /* FP_ILOGB0 */
+ ret
+END (__ieee754_ilogbl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log.S b/REORG.TODO/sysdeps/i386/fpu/e_log.S
new file mode 100644
index 0000000000..335df22577
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log.S
@@ -0,0 +1,92 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fxam
+ fnstsw
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ sahf
+ jc 3f // in case x is NaN or +-Inf
+4: fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: jp 4b // in case x is +-Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 6f
+ fabs // log(1) is +0 in all rounding modes.
+6: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__log_finite)
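
fyl2x computes st(1)*log2(st(0)), so with fldln2 preloading ln(2) the result is
ln(x). Near x = 1 that loses accuracy: log2(x) is tiny and the rounding of x
itself dominates. The code therefore forms d = x-1 exactly and switches to
fyl2xp1, which evaluates st(1)*log2(1+d) without the cancellation; `limit' =
0.29 is a conservative bound on fyl2xp1's valid range. The branch modeled in C
(a sketch only; log1p is the C analogue of the fyl2xp1 path):

#include <math.h>

static double log_model (double x)
{
  double d = x - 1.0;      /* exact for x near 1. */
  if (fabs (d) < 0.29)     /* the `limit' constant. */
    return log1p (d);      /* fyl2xp1: ln(2) * log2(1 + d). */
  return log (x);          /* fyl2x:   ln(2) * log2(x). */
}
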
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10.S b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
new file mode 100644
index 0000000000..17277084ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10.S
@@ -0,0 +1,68 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log10)
+ fldlg2 // log10(2)
+ fldl 4(%esp) // x : log10(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fxam
+ fnstsw
+ fld %st // x : x : log10(2)
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsubl MO(one) // x-1 : x : log10(2)
+ fld %st // x-1 : x-1 : x : log10(2)
+ fabs // |x-1| : x-1 : x : log10(2)
+ fcompl MO(limit) // x-1 : x : log10(2)
+ fnstsw // x-1 : x : log10(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log10(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log10(2)
+ fyl2xp1 // log10(x)
+ ret
+
+2: fstp %st(0) // x : log10(2)
+ fyl2x // log10(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_log10)
+strong_alias (__ieee754_log10, __log10_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10f.S b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
new file mode 100644
index 0000000000..72a3b88251
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log10f)
+ fldlg2 // log10(2)
+ flds 4(%esp) // x : log10(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fxam
+ fnstsw
+ fld %st // x : x : log10(2)
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsubl MO(one) // x-1 : x : log10(2)
+ fld %st // x-1 : x-1 : x : log10(2)
+ fabs // |x-1| : x-1 : x : log10(2)
+ fcompl MO(limit) // x-1 : x : log10(2)
+ fnstsw // x-1 : x : log10(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log10(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log10(2)
+ fyl2xp1 // log10(x)
+ ret
+
+2: fstp %st(0) // x : log10(2)
+ fyl2x // log10(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_log10f)
+strong_alias (__ieee754_log10f, __log10f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10l.S b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
new file mode 100644
index 0000000000..9326b19796
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S
@@ -0,0 +1,71 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log10l)
+ fldlg2 // log10(2)
+ fldt 4(%esp) // x : log10(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fxam
+ fnstsw
+ fld %st // x : x : log10(2)
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsubl MO(one) // x-1 : x : log10(2)
+ fld %st // x-1 : x-1 : x : log10(2)
+ fabs // |x-1| : x-1 : x : log10(2)
+ fcompl MO(limit) // x-1 : x : log10(2)
+ fnstsw // x-1 : x : log10(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log10(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log10(2)
+ fyl2xp1 // log10(x)
+ ret
+
+2: fstp %st(0) // x : log10(2)
+ fyl2x // log10(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ fadd %st(0)
+ ret
+END(__ieee754_log10l)
+strong_alias (__ieee754_log10l, __log10l_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2.S b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
new file mode 100644
index 0000000000..73ff0fffd3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fldl MO(one)
+ fldl 4(%esp) // x : 1
+ fxam
+ fnstsw
+ fld %st // x : x : 1
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsub %st(2), %st // x-1 : x : 1
+ fld %st // x-1 : x-1 : x : 1
+ fabs // |x-1| : x-1 : x : 1
+ fcompl MO(limit) // x-1 : x : 1
+ fnstsw // x-1 : x : 1
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log2(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : 1
+ fyl2xp1 // log2(x)
+ ret
+
+2: fstp %st(0) // x : 1
+ fyl2x // log2(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_log2)
+strong_alias (__ieee754_log2, __log2_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2f.S b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
new file mode 100644
index 0000000000..344eeb495e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S
@@ -0,0 +1,69 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log2f)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fldl MO(one)
+ flds 4(%esp) // x : 1
+ fxam
+ fnstsw
+ fld %st // x : x : 1
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsub %st(2), %st // x-1 : x : 1
+ fld %st // x-1 : x-1 : x : 1
+ fabs // |x-1| : x-1 : x : 1
+ fcompl MO(limit) // x-1 : x : 1
+ fnstsw // x-1 : x : 1
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log2(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : 1
+ fyl2xp1 // log2(x)
+ ret
+
+2: fstp %st(0) // x : 1
+ fyl2x // log2(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_log2f)
+strong_alias (__ieee754_log2f, __log2f_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2l.S b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
new file mode 100644
index 0000000000..73e62ea908
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S
@@ -0,0 +1,70 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_log2l)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fldl MO(one)
+ fldt 4(%esp) // x : 1
+ fxam
+ fnstsw
+ fld %st // x : x : 1
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fsub %st(2), %st // x-1 : x : 1
+ fld %st // x-1 : x-1 : x : 1
+ fabs // |x-1| : x-1 : x : 1
+ fcompl MO(limit) // x-1 : x : 1
+ fnstsw // x-1 : x : 1
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log2(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : 1
+ fyl2xp1 // log2(x)
+ ret
+
+2: fstp %st(0) // x : 1
+ fyl2x // log2(x)
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ fadd %st(0)
+ ret
+END (__ieee754_log2l)
+strong_alias (__ieee754_log2l, __log2l_finite)
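
The log, log10 and log2 files above are otherwise identical; the base is
selected purely by the constant preloaded as st(1) before fyl2x/fyl2xp1:
fldln2 (ln 2) for natural log, fldlg2 (log10 of 2) for log10, and a plain 1.0
for log2. That works because (sketch):

#include <math.h>

/* fyl2x computes st(1) * log2(st(0)); picking y selects the base:
   y = ln(2) -> ln(x), y = log10(2) -> log10(x), y = 1.0 -> log2(x). */
static double fyl2x_model (double y, double x)
{
  return y * log2 (x);
}
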
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
new file mode 100644
index 0000000000..de967a31f5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logf.S
@@ -0,0 +1,93 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_logf)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fxam
+ fnstsw
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ sahf
+ jc 3f // in case x is NaN or +-Inf
+4: fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: jp 4b // in case x is +-Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 6f
+ fabs // log(1) is +0 in all rounding modes.
+6: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
new file mode 100644
index 0000000000..53127d704e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_logl.S
@@ -0,0 +1,97 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_logl)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+ fxam
+ fnstsw
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ sahf
+ jc 3f // in case x is NaN or +-Inf
+ movzwl 4+8(%esp), %eax
+ cmpl $0xc000, %eax
+ jae 6f // x <= -2, avoid overflow from -LDBL_MAX - 1.
+4: fsubl MO(one) // x-1 : x : log(2)
+6: fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 5f
+ fabs // log(1) is +0 in all rounding modes.
+5: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: jp 4b // in case x is +-Inf
+ fstp %st(1)
+ fstp %st(1)
+ fadd %st(0)
+ ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 7f
+ fabs // log(1) is +0 in all rounding modes.
+7: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_pow.S b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
new file mode 100644
index 0000000000..2edb9a9fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_pow.S
@@ -0,0 +1,456 @@
+/* ix87 specific implementation of pow function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ .type p63,@object
+p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+ ASM_SIZE_DIRECTIVE(p63)
+ .type p10,@object
+p10: .byte 0, 0, 0, 0, 0, 0, 0x90, 0x40
+ ASM_SIZE_DIRECTIVE(p10)
+
+ .section .rodata.cst16,"aM",@progbits,16
+
+ .p2align 3
+ .type infinity,@object
+inf_zero:
+infinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+ ASM_SIZE_DIRECTIVE(infinity)
+ .type zero,@object
+zero: .double 0.0
+ ASM_SIZE_DIRECTIVE(zero)
+ .type minf_mzero,@object
+minf_mzero:
+minfinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+ .text
+ENTRY(__ieee754_pow)
+ fldl 12(%esp) // y
+ fxam
+
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+
+ fnstsw
+ movb %ah, %dl
+ andb $0x45, %ah
+ cmpb $0x40, %ah // is y == 0 ?
+ je 11f
+
+ cmpb $0x05, %ah // is y == ±inf ?
+ je 12f
+
+ cmpb $0x01, %ah // is y == NaN ?
+ je 30f
+
+ fldl 4(%esp) // x : y
+
+ subl $8,%esp
+ cfi_adjust_cfa_offset (8)
+
+ fxam
+ fnstsw
+ movb %ah, %dh
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ je 20f // x is ±0
+
+ cmpb $0x05, %ah
+ je 15f // x is ±inf
+
+ cmpb $0x01, %ah
+ je 32f // x is NaN
+
+ fxch // y : x
+
+ /* fistpll raises invalid exception for |y| >= 1L<<63. */
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p63) // y : x
+ fnstsw
+ sahf
+ jnc 2f
+
+ /* First see whether `y' is a natural number. In this case we
+ can use a more precise algorithm. */
+ fld %st // y : y : x
+ fistpll (%esp) // y : x
+ fildll (%esp) // int(y) : y : x
+ fucomp %st(1) // y : x
+ fnstsw
+ sahf
+ jne 3f
+
+ /* OK, we have an integer value for y. If large enough that
+ errors may propagate out of the 11 bits excess precision, use
+ the algorithm for real exponent instead. */
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p10) // y : x
+ fnstsw
+ sahf
+ jnc 2f
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ orl $0, %edx
+ fstp %st(0) // x
+ jns 4f // y >= 0, jump
+ fdivrl MO(one) // 1/x (now referred to as x)
+ negl %eax
+ adcl $0, %edx
+ negl %edx
+4: fldl MO(one) // 1 : x
+ fxch
+
+ /* If y is even, take the absolute value of x. Otherwise,
+ ensure all intermediate values that might overflow have the
+ sign of x. */
+ testb $1, %al
+ jnz 6f
+ fabs
+
+6: shrdl $1, %edx, %eax
+ jnc 5f
+ fxch
+ fabs
+ fmul %st(1) // x : ST*x
+ fxch
+5: fld %st // x : x : ST*x
+ fabs // |x| : x : ST*x
+ fmulp // |x|*x : ST*x
+ shrl $1, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jnz 6b
+ fstp %st(0) // ST*x
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ DBL_NARROW_EVAL_UFLOW_NONNAN
+ ret
+
+ /* y is ±NAN */
+30: fldl 4(%esp) // x : y
+ fldl MO(one) // 1.0 : x : y
+ fucomp %st(1) // x : y
+ fnstsw
+ sahf
+ je 31f
+ fxch // y : x
+31: fstp %st(1)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+32: addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fstp %st(1)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+2: // y is a large integer (absolute value at least 1L<<10), but
+ // may be odd unless at least 1L<<64. So it may be necessary
+ // to adjust the sign of a negative result afterwards.
+ fxch // x : y
+ fabs // |x| : y
+ fxch // y : x
+ .align ALIGNARG(4)
+3: /* y is a real number. */
+ fxch // x : y
+ fldl MO(one) // 1.0 : x : y
+ fldl MO(limit) // 0.29 : 1.0 : x : y
+ fld %st(2) // x : 0.29 : 1.0 : x : y
+ fsub %st(2) // x-1 : 0.29 : 1.0 : x : y
+ fabs // |x-1| : 0.29 : 1.0 : x : y
+ fucompp // 1.0 : x : y
+ fnstsw
+ fxch // x : 1.0 : y
+ sahf
+ ja 7f
+ fsub %st(1) // x-1 : 1.0 : y
+ fyl2xp1 // log2(x) : y
+ jmp 8f
+
+7: fyl2x // log2(x) : y
+8: fmul %st(1) // y*log2(x) : y
+ fst %st(1) // y*log2(x) : y*log2(x)
+ frndint // int(y*log2(x)) : y*log2(x)
+ fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x))
+ fxch // fract(y*log2(x)) : int(y*log2(x))
+ f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x))
+ faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x))
+
+ // Before scaling, we must negate if x is negative and y is an
+ // odd integer.
+ testb $2, %dh
+ jz 291f
+ // x is negative. If y is an odd integer, negate the result.
+ fldl 20(%esp) // y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fld %st // y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fabs // |y| : y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fcompl MO(p63) // y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fnstsw
+ sahf
+ jnc 290f
+
+ // We must find out whether y is an odd integer.
+ fld %st // y : y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fistpll (%esp) // y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fildll (%esp) // int(y) : y : 2^fract(y*log2(x)) : int(y*log2(x))
+ fucompp // 2^fract(y*log2(x)) : int(y*log2(x))
+ fnstsw
+ sahf
+ jne 291f
+
+ // OK, the value is an integer, but is it odd?
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 292f // jump if not odd
+ // It's an odd integer.
+ fchs
+ jmp 292f
+
+ cfi_adjust_cfa_offset (8)
+290: fstp %st(0) // 2^fract(y*log2(x)) : int(y*log2(x))
+291: addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+292: fscale // +/- 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+ fstp %st(1) // +/- 2^fract(y*log2(x))*2^int(y*log2(x))
+ DBL_NARROW_EVAL_UFLOW_NONNAN
+ ret
+
+
+ // pow(x,±0) = 1
+ .align ALIGNARG(4)
+11: fstp %st(0) // pop y
+ fldl MO(one)
+ ret
+
+ // y == ±inf
+ .align ALIGNARG(4)
+12: fstp %st(0) // pop y
+ fldl MO(one) // 1
+ fldl 4(%esp) // x : 1
+ fabs // abs(x) : 1
+ fucompp // < 1, == 1, or > 1
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x45, %ah
+ je 13f // jump if x is NaN
+
+ cmpb $0x40, %ah
+ je 14f // jump if |x| == 1
+
+ shlb $1, %ah
+ xorb %ah, %dl
+ andl $2, %edx
+ fldl MOX(inf_zero, %edx, 4)
+ ret
+
+ .align ALIGNARG(4)
+14: fldl MO(one)
+ ret
+
+ .align ALIGNARG(4)
+13: fldl 4(%esp) // load x == NaN
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±inf
+15: fstp %st(0) // y
+ testb $2, %dh
+ jz 16f // jump if x == +inf
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p63) // y
+ fnstsw
+ sahf
+ jnc 16f
+
+ // We must find out whether y is an odd integer.
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 17f
+
+ // OK, the value is an integer.
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 18f // jump if not odd
+ // It's an odd integer.
+ shrl $31, %edx
+ fldl MOX(minf_mzero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+16: fcompl MO(zero)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fnstsw
+ shrl $5, %eax
+ andl $8, %eax
+ fldl MOX(inf_zero, %eax, 1)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+17: shll $30, %edx // sign bit for y in right position
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+18: shrl $31, %edx
+ fldl MOX(inf_zero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±0
+20: fstp %st(0) // y
+ testb $2, %dl
+ jz 21f // y > 0
+
+ // x is ±0 and y is < 0. We must find out whether y is an odd integer.
+ testb $2, %dh
+ jz 25f
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p63) // y
+ fnstsw
+ sahf
+ jnc 25f
+
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 26f
+
+ // OK, the value is an integer.
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 27f // jump if not odd
+ // It's an odd integer.
+ // Raise divide-by-zero exception and get minus infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ fchs
+ ret
+
+ cfi_adjust_cfa_offset (8)
+25: fstp %st(0)
+26: addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+27: // Raise divide-by-zero exception and get infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±0 and y is > 0. We must find out whether y is an odd integer.
+21: testb $2, %dh
+ jz 22f
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fcoml MO(p63) // y
+ fnstsw
+ sahf
+ jnc 22f
+
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 23f
+
+ // OK, the value is an integer.
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 24f // jump if not odd
+ // It's an odd integer.
+ fldl MO(mzero)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+22: fstp %st(0)
+23: addl $8, %esp // Don't use 2 x pop
+ cfi_adjust_cfa_offset (-8)
+24: fldl MO(zero)
+ ret
+
+END(__ieee754_pow)
+strong_alias (__ieee754_pow, __pow_finite)
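
The integer fast path (labels 4/5/6) is binary exponentiation: the 64-bit
integer y is popped into %edx:%eax, a negative exponent is handled by first
taking 1/x, and the shrdl/shrl pair walks the bits, multiplying the accumulator
on each set bit and squaring x each round (the extra fabs games that keep
intermediate signs from overflowing wrongly are omitted below). The idea in C
(a sketch, hypothetical name, no error handling):

static double pow_int_model (double x, long long n)
{
  unsigned long long un
    = n < 0 ? -(unsigned long long) n : (unsigned long long) n;
  double r = 1.0;
  if (n < 0)
    x = 1.0 / x;     /* fdivrl MO(one): compute (1/x)^|n|. */
  while (un)
    {
      if (un & 1)
        r *= x;      /* low bit set: fold the current power in. */
      un >>= 1;      /* shrdl $1, %edx, %eax / shrl $1, %edx. */
      x *= x;        /* square for the next bit. */
    }
  return r;
}
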
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powf.S b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
new file mode 100644
index 0000000000..467ef2380b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powf.S
@@ -0,0 +1,392 @@
+/* ix87 specific implementation of pow function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ .type p31,@object
+p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+ ASM_SIZE_DIRECTIVE(p31)
+
+ .section .rodata.cst16,"aM",@progbits,16
+
+ .p2align 3
+ .type infinity,@object
+inf_zero:
+infinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+ ASM_SIZE_DIRECTIVE(infinity)
+ .type zero,@object
+zero: .double 0.0
+ ASM_SIZE_DIRECTIVE(zero)
+ .type minf_mzero,@object
+minf_mzero:
+minfinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+ .text
+ENTRY(__ieee754_powf)
+ flds 8(%esp) // y
+ fxam
+
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+
+ fnstsw
+ movb %ah, %dl
+ andb $0x45, %ah
+ cmpb $0x40, %ah // is y == 0 ?
+ je 11f
+
+ cmpb $0x05, %ah // is y == ±inf ?
+ je 12f
+
+ cmpb $0x01, %ah // is y == NaN ?
+ je 30f
+
+ flds 4(%esp) // x : y
+
+ subl $4, %esp
+ cfi_adjust_cfa_offset (4)
+
+ fxam
+ fnstsw
+ movb %ah, %dh
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ je 20f // x is ±0
+
+ cmpb $0x05, %ah
+ je 15f // x is ±inf
+
+ cmpb $0x01, %ah
+ je 33f // x is NaN
+
+ fxch // y : x
+
+ /* fistpl raises invalid exception for |y| >= 1L<<31. */
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p31) // y : x
+ fnstsw
+ sahf
+ jnc 2f
+
+ /* First see whether `y' is a natural number. In this case we
+ can use a more precise algorithm. */
+ fld %st // y : y : x
+ fistpl (%esp) // y : x
+ fildl (%esp) // int(y) : y : x
+ fucomp %st(1) // y : x
+ fnstsw
+ sahf
+ jne 3f
+
+ /* OK, we have an integer value for y. */
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ orl $0, %edx
+ fstp %st(0) // x
+ jns 4f // y >= 0, jump
+ fdivrl MO(one) // 1/x (now referred to as x)
+ negl %edx
+4: fldl MO(one) // 1 : x
+ fxch
+
+ /* If y is even, take the absolute value of x. Otherwise,
+ ensure all intermediate values that might overflow have the
+ sign of x. */
+ testb $1, %dl
+ jnz 6f
+ fabs
+
+6: shrl $1, %edx
+ jnc 5f
+ fxch
+ fabs
+ fmul %st(1) // x : ST*x
+ fxch
+5: fld %st // x : x : ST*x
+ fabs // |x| : x : ST*x
+ fmulp // |x|*x : ST*x
+ testl %edx, %edx
+ jnz 6b
+ fstp %st(0) // ST*x
+ FLT_NARROW_EVAL_UFLOW_NONNAN
+ ret
+
+ /* y is ±NAN */
+30: flds 4(%esp) // x : y
+ fldl MO(one) // 1.0 : x : y
+ fucomp %st(1) // x : y
+ fnstsw
+ sahf
+ je 31f
+ fxch // y : x
+31: fstp %st(1)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+2: /* y is a large integer (so even). */
+ fxch // x : y
+ fabs // |x| : y
+ fxch // y : x
+ .align ALIGNARG(4)
+3: /* y is a real number. */
+ fxch // x : y
+ fldl MO(one) // 1.0 : x : y
+ fldl MO(limit) // 0.29 : 1.0 : x : y
+ fld %st(2) // x : 0.29 : 1.0 : x : y
+ fsub %st(2) // x-1 : 0.29 : 1.0 : x : y
+ fabs // |x-1| : 0.29 : 1.0 : x : y
+ fucompp // 1.0 : x : y
+ fnstsw
+ fxch // x : 1.0 : y
+ sahf
+ ja 7f
+ fsub %st(1) // x-1 : 1.0 : y
+ fyl2xp1 // log2(x) : y
+ jmp 8f
+
+7: fyl2x // log2(x) : y
+8: fmul %st(1) // y*log2(x) : y
+ fst %st(1) // y*log2(x) : y*log2(x)
+ frndint // int(y*log2(x)) : y*log2(x)
+ fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x))
+ fxch // fract(y*log2(x)) : int(y*log2(x))
+ f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x))
+ faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x))
+ fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
+32: addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+ fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x))
+ FLT_NARROW_EVAL_UFLOW_NONNAN
+ ret
+
+ /* x is NaN. */
+ cfi_adjust_cfa_offset (4)
+33: addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+ fstp %st(1)
+ ret
+
+ // pow(x,±0) = 1
+ .align ALIGNARG(4)
+11: fstp %st(0) // pop y
+ fldl MO(one)
+ ret
+
+ // y == ±inf
+ .align ALIGNARG(4)
+12: fstp %st(0) // pop y
+ fldl MO(one) // 1
+ flds 4(%esp) // x : 1
+ fabs // abs(x) : 1
+ fucompp // < 1, == 1, or > 1
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x45, %ah
+ je 13f // jump if x is NaN
+
+ cmpb $0x40, %ah
+ je 14f // jump if |x| == 1
+
+ shlb $1, %ah
+ xorb %ah, %dl
+ andl $2, %edx
+ fldl MOX(inf_zero, %edx, 4)
+ ret
+
+ .align ALIGNARG(4)
+14: fldl MO(one)
+ ret
+
+ .align ALIGNARG(4)
+13: flds 4(%esp) // load x == NaN
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+ // x is ±inf
+15: fstp %st(0) // y
+ testb $2, %dh
+ jz 16f // jump if x == +inf
+
+ // fistpl raises invalid exception for |y| >= 1L<<31, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p31) // y
+ fnstsw
+ sahf
+ jnc 16f
+
+ // We must find out whether y is an odd integer.
+ fld %st // y : y
+ fistpl (%esp) // y
+ fildl (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 17f
+
+ // OK, the value is an integer.
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ testb $1, %dl
+ jz 18f // jump if not odd
+ // It's an odd integer.
+ shrl $31, %edx
+ fldl MOX(minf_mzero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+16: fcompl MO(zero)
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+ fnstsw
+ shrl $5, %eax
+ andl $8, %eax
+ fldl MOX(inf_zero, %eax, 1)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+17: shll $30, %edx // sign bit for y in right position
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+18: shrl $31, %edx
+ fldl MOX(inf_zero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+ // x is ±0
+20: fstp %st(0) // y
+ testb $2, %dl
+ jz 21f // y > 0
+
+ // x is ±0 and y is < 0. We must find out whether y is an odd integer.
+ testb $2, %dh
+ jz 25f
+
+ // fistpl raises invalid exception for |y| >= 1L<<31, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p31) // y
+ fnstsw
+ sahf
+ jnc 25f
+
+ fld %st // y : y
+ fistpl (%esp) // y
+ fildl (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 26f
+
+ // OK, the value is an integer.
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ testb $1, %dl
+ jz 27f // jump if not odd
+ // It's an odd integer.
+ // Raise divide-by-zero exception and get minus infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ fchs
+ ret
+
+ cfi_adjust_cfa_offset (4)
+25: fstp %st(0)
+26: addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+27: // Raise divide-by-zero exception and get infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ .align ALIGNARG(4)
+ // x is ±0 and y is > 0. We must find out whether y is an odd integer.
+21: testb $2, %dh
+ jz 22f
+
+ // fistpl raises invalid exception for |y| >= 1L<<31, so test
+ // that (in which case y is certainly even) before testing
+ // whether y is odd.
+ fcoml MO(p31) // y
+ fnstsw
+ sahf
+ jnc 22f
+
+ fld %st // y : y
+ fistpl (%esp) // y
+ fildl (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 23f
+
+ // OK, the value is an integer.
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ testb $1, %dl
+ jz 24f // jump if not odd
+ // It's an odd integer.
+ fldl MO(mzero)
+ ret
+
+ cfi_adjust_cfa_offset (4)
+22: fstp %st(0)
+23: addl $4, %esp // Don't use pop.
+ cfi_adjust_cfa_offset (-4)
+24: fldl MO(zero)
+ ret
+
+END(__ieee754_powf)
+strong_alias (__ieee754_powf, __powf_finite)
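
The real-exponent path (labels 3/7/8) shared by these pow variants computes
x^y = 2^(y*log2(x)), splitting the product with frndint so that f2xm1 only ever
sees a fractional argument and fscale applies the integer part exactly. A C
model of the split (a sketch; assumes x > 0 and finite, and ignores the near-1
fyl2xp1 refinement and the narrowing/underflow macros):

#include <math.h>

static float powf_model (float x, float y)
{
  double t = (double) y * log2 ((double) x);  /* fyl2x / fyl2xp1. */
  double i = rint (t);                        /* frndint. */
  double f = t - i;                           /* fract(y*log2(x)). */
  return (float) scalbn (exp2 (f), (int) i);  /* f2xm1 + 1, then fscale. */
}
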
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powl.S b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
new file mode 100644
index 0000000000..9e162848e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_powl.S
@@ -0,0 +1,459 @@
+/* ix87 specific implementation of pow function.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type p2,@object
+p2: .byte 0, 0, 0, 0, 0, 0, 0x10, 0x40
+ ASM_SIZE_DIRECTIVE(p2)
+ .type p63,@object
+p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+ ASM_SIZE_DIRECTIVE(p63)
+ .type p64,@object
+p64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+ ASM_SIZE_DIRECTIVE(p64)
+ .type p78,@object
+p78: .byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44
+ ASM_SIZE_DIRECTIVE(p78)
+ .type pm79,@object
+pm79: .byte 0, 0, 0, 0, 0, 0, 0, 0x3b
+ ASM_SIZE_DIRECTIVE(pm79)
+
+ .section .rodata.cst16,"aM",@progbits,16
+
+ .p2align 3
+ .type infinity,@object
+inf_zero:
+infinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
+ ASM_SIZE_DIRECTIVE(infinity)
+ .type zero,@object
+zero: .double 0.0
+ ASM_SIZE_DIRECTIVE(zero)
+ .type minf_mzero,@object
+minf_mzero:
+minfinity:
+ .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
+mzero:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+ .text
+ENTRY(__ieee754_powl)
+ fldt 16(%esp) // y
+ fxam
+
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+
+ fnstsw
+ movb %ah, %dl
+ andb $0x45, %ah
+ cmpb $0x40, %ah // is y == 0 ?
+ je 11f
+
+ cmpb $0x05, %ah // is y == ±inf ?
+ je 12f
+
+ cmpb $0x01, %ah // is y == NaN ?
+ je 30f
+
+ fldt 4(%esp) // x : y
+
+ subl $8,%esp
+ cfi_adjust_cfa_offset (8)
+
+ fxam
+ fnstsw
+ movb %ah, %dh
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ je 20f // x is ±0
+
+ cmpb $0x05, %ah
+ je 15f // x is ±inf
+
+ cmpb $0x01, %ah
+ je 32f // x is NaN
+
+ fxch // y : x
+
+ /* fistpll raises invalid exception for |y| >= 1L<<63. */
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p63) // y : x
+ fnstsw
+ sahf
+ jnc 2f
+
+ /* First see whether `y' is a natural number. In this case we
+ can use a more precise algorithm. */
+ fld %st // y : y : x
+ fistpll (%esp) // y : x
+ fildll (%esp) // int(y) : y : x
+ fucomp %st(1) // y : x
+ fnstsw
+ sahf
+ je 9f
+
+ // If y has absolute value at most 0x1p-79, then any finite
+ // nonzero x will result in 1. Saturate y to those bounds to
+ // avoid underflow in the calculation of y*log2(x).
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(pm79) // y : x
+ fnstsw
+ sahf
+ jnc 3f
+ fstp %st(0) // pop y
+ fldl MO(pm79) // 0x1p-79 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -0x1p-79 : x
+ jmp 3f
+
+9: /* OK, we have an integer value for y. Unless very small
+ (we use < 4), use the algorithm for real exponent to avoid
+ accumulation of errors. */
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p2) // y : x
+ fnstsw
+ sahf
+ jnc 3f
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ orl $0, %edx
+ fstp %st(0) // x
+ jns 4f // y >= 0, jump
+ fdivrl MO(one) // 1/x (now referred to as x)
+ negl %eax
+ adcl $0, %edx
+ negl %edx
+4: fldl MO(one) // 1 : x
+ fxch
+
+ /* If y is even, take the absolute value of x. Otherwise,
+ ensure all intermediate values that might overflow have the
+ sign of x. */
+ testb $1, %al
+ jnz 6f
+ fabs
+
+6: shrdl $1, %edx, %eax
+ jnc 5f
+ fxch
+ fabs
+ fmul %st(1) // x : ST*x
+ fxch
+5: fld %st // x : x : ST*x
+ fabs // |x| : x : ST*x
+ fmulp // |x|*x : ST*x
+ shrl $1, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jnz 6b
+ fstp %st(0) // ST*x
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ LDBL_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+ /* y is ±NAN */
+30: fldt 4(%esp) // x : y
+ fldl MO(one) // 1.0 : x : y
+ fucomp %st(1) // x : y
+ fnstsw
+ sahf
+ je 33f
+31: /* At least one argument NaN, and result should be NaN. */
+ faddp
+ ret
+33: jp 31b
+ /* pow (1, NaN); check if the NaN signaling. */
+ testb $0x40, 23(%esp)
+ jz 31b
+ fstp %st(1)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+32: addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ faddp
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+2: // y is a large integer (absolute value at least 1L<<63).
+ // If y has absolute value at least 1L<<78, then any finite
+ // nonzero x will result in 0 (underflow), 1 or infinity (overflow).
+ // Saturate y to those bounds to avoid overflow in the calculation
+ // of y*log2(x).
+ fld %st // y : y : x
+ fabs // |y| : y : x
+ fcompl MO(p78) // y : x
+ fnstsw
+ sahf
+ jc 3f
+ fstp %st(0) // pop y
+ fldl MO(p78) // 1L<<78 : x
+ testb $2, %dl
+ jz 3f // y > 0
+ fchs // -(1L<<78) : x
+ .align ALIGNARG(4)
+3: /* y is a real number. */
+ subl $28, %esp
+ cfi_adjust_cfa_offset (28)
+ fstpt 12(%esp) // x
+ fstpt (%esp) // <empty>
+ call HIDDEN_JUMPTARGET (__powl_helper) // <result>
+ addl $36, %esp
+ cfi_adjust_cfa_offset (-36)
+ ret
+
+ // pow(x,±0) = 1, unless x is sNaN
+ .align ALIGNARG(4)
+11: fstp %st(0) // pop y
+ fldt 4(%esp) // x
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 112f // x is NaN
+111: fstp %st(0)
+ fldl MO(one)
+ ret
+
+112: testb $0x40, 11(%esp)
+ jnz 111b
+ fadd %st(0)
+ ret
+
+ // y == ±inf
+ .align ALIGNARG(4)
+12: fstp %st(0) // pop y
+ fldl MO(one) // 1
+ fldt 4(%esp) // x : 1
+ fabs // abs(x) : 1
+ fucompp // < 1, == 1, or > 1
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x45, %ah
+ je 13f // jump if x is NaN
+
+ cmpb $0x40, %ah
+ je 14f // jump if |x| == 1
+
+ shlb $1, %ah
+ xorb %ah, %dl
+ andl $2, %edx
+ fldl MOX(inf_zero, %edx, 4)
+ ret
+
+ .align ALIGNARG(4)
+14: fldl MO(one)
+ ret
+
+ .align ALIGNARG(4)
+13: fldt 4(%esp) // load x == NaN
+ fadd %st(0)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±inf
+15: fstp %st(0) // y
+ testb $2, %dh
+ jz 16f // jump if x == +inf
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, but y
+ // may be odd unless we know |y| >= 1L<<64.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p64) // y
+ fnstsw
+ sahf
+ jnc 16f
+ fldl MO(p63) // p63 : y
+ fxch // y : p63
+ fprem // y%p63 : p63
+ fstp %st(1) // y%p63
+
+ // We must find out whether y is an odd integer.
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 17f
+
+ // OK, the value is an integer, but is it odd?
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 18f // jump if not odd
+ // It's an odd integer.
+ shrl $31, %edx
+ fldl MOX(minf_mzero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+16: fcompl MO(zero)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fnstsw
+ shrl $5, %eax
+ andl $8, %eax
+ fldl MOX(inf_zero, %eax, 1)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+17: shll $30, %edx // sign bit for y in right position
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+18: shrl $31, %edx
+ fldl MOX(inf_zero, %edx, 8)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±0
+20: fstp %st(0) // y
+ testb $2, %dl
+ jz 21f // y > 0
+
+ // x is ±0 and y is < 0. We must find out whether y is an odd integer.
+ testb $2, %dh
+ jz 25f
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, but y
+ // may be odd unless we know |y| >= 1L<<64.
+ fld %st // y : y
+ fabs // |y| : y
+ fcompl MO(p64) // y
+ fnstsw
+ sahf
+ jnc 25f
+ fldl MO(p63) // p63 : y
+ fxch // y : p63
+ fprem // y%p63 : p63
+ fstp %st(1) // y%p63
+
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 26f
+
+ // OK, the value is an integer, but is it odd?
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 27f // jump if not odd
+ // It's an odd integer.
+ // Raise divide-by-zero exception and get minus infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ fchs
+ ret
+
+ cfi_adjust_cfa_offset (8)
+25: fstp %st(0)
+26: addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+27: // Raise divide-by-zero exception and get infinity value.
+ fldl MO(one)
+ fdivl MO(zero)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+ .align ALIGNARG(4)
+ // x is ±0 and y is > 0. We must find out whether y is an odd integer.
+21: testb $2, %dh
+ jz 22f
+
+ // fistpll raises invalid exception for |y| >= 1L<<63, but y
+ // may be odd unless we know |y| >= 1L<<64.
+ fld %st // y : y
+ fcompl MO(p64) // y
+ fnstsw
+ sahf
+ jnc 22f
+ fldl MO(p63) // p63 : y
+ fxch // y : p63
+ fprem // y%p63 : p63
+ fstp %st(1) // y%p63
+
+ fld %st // y : y
+ fistpll (%esp) // y
+ fildll (%esp) // int(y) : y
+ fucompp // <empty>
+ fnstsw
+ sahf
+ jne 23f
+
+ // OK, the value is an integer, but is it odd?
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ andb $1, %al
+ jz 24f // jump if not odd
+ // It's an odd integer.
+ fldl MO(mzero)
+ ret
+
+ cfi_adjust_cfa_offset (8)
+22: fstp %st(0)
+23: addl $8, %esp // Don't use 2 x pop
+ cfi_adjust_cfa_offset (-8)
+24: fldl MO(zero)
+ ret
+
+END(__ieee754_powl)
+strong_alias (__ieee754_powl, __powl_finite)
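
Unlike the double and float versions, powl delegates the real-exponent case to
__powl_helper and first saturates y to ±0x1p-79 or ±0x1p78, outside of which
the result is determined anyway, so y*log2(x) can neither underflow nor
overflow. The odd/even test for the x = ±0 and x = ±Inf branches reduces |y|
below 2^63 with fprem (by the even constant p63, which preserves integerness
and parity) before fistpll, since fistpll would raise invalid for |y| >= 2^63.
The test in C (a sketch; assumes y is finite, hypothetical name):

#include <math.h>
#include <stdbool.h>

static bool y_is_odd_integer (long double y)
{
  if (fabsl (y) >= 0x1p64L)
    return false;                /* 64-bit significand: must be even. */
  y = fmodl (y, 0x1p63L);        /* fprem by p63 keeps parity. */
  long long n = (long long) y;   /* fistpll. */
  return (long double) n == y && (n & 1) != 0;
}
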
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
new file mode 100644
index 0000000000..1347b0468c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c
@@ -0,0 +1,3 @@
+/* Empty. This file is only meant to avoid compiling the file with the
+ same name in the libm-ieee754 directory. The code is not used since
+ there is an assembler version for all users of this file. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainder.S b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
new file mode 100644
index 0000000000..f7867aa90b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainder)
+ fldl 12(%esp)
+ fldl 4(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+ ret
+END (__ieee754_remainder)
+strong_alias (__ieee754_remainder, __remainder_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
new file mode 100644
index 0000000000..cfd390bc69
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderf)
+ flds 8(%esp)
+ flds 4(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+ ret
+END (__ieee754_remainderf)
+strong_alias (__ieee754_remainderf, __remainderf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
new file mode 100644
index 0000000000..5ec23a37a3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_remainderl)
+ fldt 16(%esp)
+ fldt 4(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+ ret
+END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite)
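
These three differ from the fmod family only in using fprem1, which picks the
quotient by round-to-nearest-even as IEEE remainder requires, where fprem
truncates it toward zero. A quick demonstration of the difference:

#include <math.h>
#include <stdio.h>

int main (void)
{
  /* fmod truncates the quotient (2), remainder rounds it (3).  */
  printf ("%g %g\n", fmod (5.5, 2.0), remainder (5.5, 2.0));
  /* Prints: 1.5 -0.5 */
  return 0;
}
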
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalb.S b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
new file mode 100644
index 0000000000..370924c29f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S
@@ -0,0 +1,100 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type zero_nan,@object
+zero_nan:
+ .double 0.0
+nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ ASM_SIZE_DIRECTIVE(zero_nan)
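+/* The table above holds +0.0 at offset 0, a NaN at offset 8, -0.0 at
+   offset 16 and a NaN at offset 24.  The -Inf path below indexes it
+   with 8 * (x is NaN) + 16 * (x is negative), so finite x yields a
+   zero of x's sign and NaNs are propagated.  */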
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
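+/* fxam classifies st(0) into C3/C2/C0 (bits 14, 10 and 8 of the
+   status word), with C1 (bit 9) holding the sign.  Under the mask
+   0x4700 the value 0x0700 (C2|C1|C0) means -Inf; under 0x4500 the
+   value 0x0100 (C0 alone) means NaN.  fscale computes
+   st(0) * 2^trunc(st(1)).  */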
+ .text
+ENTRY(__ieee754_scalb)
+ fldl 12(%esp)
+ fxam
+ fnstsw
+ fldl 4(%esp)
+ andl $0x4700, %eax
+ cmpl $0x0700, %eax
+ je 1f
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fxam
+ fnstsw
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 3f
+ fld %st(1)
+ frndint
+ fcomp %st(2)
+ fnstsw
+ sahf
+ jne 4f
+ fscale
+ fstp %st(1)
+ DBL_NARROW_EVAL
+ ret
+
+ /* y is -inf */
+1: fxam
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fnstsw
+ movl 8(%esp), %edx
+ shrl $5, %eax
+ fstp %st
+ fstp %st
+ andl $0x80000000, %edx
+ andl $0x0228, %eax
+ cmpl $0x0028, %eax
+ je 4f
+ andl $8, %eax
+ shrl $27, %edx
+ addl %edx, %eax
+ fldl MOX(zero_nan, %eax, 1)
+ ret
+
+	/* The result is NaN, but we must not raise an exception.
+	   So load a stored NaN instead of computing one.  */
+2: fstp %st
+ fstp %st
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl MO(nan)
+ ret
+
+ /* The first parameter is a NaN. Return it. */
+3: fstp %st(1)
+ ret
+
+ /* Return NaN and raise the invalid exception. */
+4: fstp %st
+ fstp %st
+ fldz
+ fdiv %st
+ ret
+END(__ieee754_scalb)
+strong_alias (__ieee754_scalb, __scalb_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
new file mode 100644
index 0000000000..4f2dfa3acf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S
@@ -0,0 +1,102 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type zero_nan,@object
+zero_nan:
+ .double 0.0
+nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+
+ .text
+ENTRY(__ieee754_scalbf)
+ flds 8(%esp)
+ fxam
+ fnstsw
+ flds 4(%esp)
+ andl $0x4700, %eax
+ cmpl $0x0700, %eax
+ je 1f
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fxam
+ fnstsw
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 3f
+ fld %st(1)
+ frndint
+ fcomp %st(2)
+ fnstsw
+ sahf
+ jne 4f
+ fscale
+ fstp %st(1)
+ FLT_NARROW_EVAL
+ ret
+
+ /* y is -inf */
+1: fxam
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fnstsw
+ movl 4(%esp), %edx
+ shrl $5, %eax
+ fstp %st
+ fstp %st
+ andl $0x80000000, %edx
+ andl $0x0228, %eax
+ cmpl $0x0028, %eax
+ je 4f
+ andl $8, %eax
+ shrl $27, %edx
+ addl %edx, %eax
+ fldl MOX(zero_nan, %eax, 1)
+ ret
+
+	/* The result is NaN, but we must not raise an exception.
+	   So load a stored NaN instead of computing one.  */
+2: fstp %st
+ fstp %st
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl MO(nan)
+ ret
+
+ /* The first parameter is a NaN. Return it. */
+3: fstp %st(1)
+ ret
+
+ /* Return NaN and raise the invalid exception. */
+4: fstp %st
+ fstp %st
+ fldz
+ fdiv %st
+ ret
+END(__ieee754_scalbf)
+strong_alias (__ieee754_scalbf, __scalbf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
new file mode 100644
index 0000000000..896f599cb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S
@@ -0,0 +1,90 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Correct handling of y==-inf <drepper@gnu>
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type zero_nan,@object
+zero_nan:
+ .double 0.0
+nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80
+ .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
+ ASM_SIZE_DIRECTIVE(zero_nan)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+#else
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
+#endif
+
+ .text
+ENTRY(__ieee754_scalbl)
+ fldt 16(%esp)
+ fxam
+ fnstsw
+ fldt 4(%esp)
+ andl $0x4700, %eax
+ cmpl $0x0700, %eax
+ je 1f
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fxam
+ fnstsw
+ andl $0x4500, %eax
+ cmpl $0x0100, %eax
+ je 2f
+ fld %st(1)
+ frndint
+ fcomp %st(2)
+ fnstsw
+ sahf
+ jne 4f
+ fscale
+ fstp %st(1)
+ ret
+
+ /* y is -inf */
+1: fxam
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fnstsw
+ movl 12(%esp), %edx
+ shrl $5, %eax
+ fstp %st
+ fstp %st
+ andl $0x8000, %edx
+ andl $0x0228, %eax
+ cmpl $0x0028, %eax
+ je 4f
+ andl $8, %eax
+ shrl $11, %edx
+ addl %edx, %eax
+ fldl MOX(zero_nan, %eax, 1)
+ ret
+
+ /* The result is NaN; raise an exception for sNaN arguments. */
+2: faddp
+ ret
+
+ /* Return NaN and raise the invalid exception. */
+4: fstp %st
+ fstp %st
+ fldz
+ fdiv %st
+ ret
+END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
new file mode 100644
index 0000000000..fba5833a9a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
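+/* Clearing bit 8 of the control word switches precision control from
+   extended (11) to double (10), so fsqrt rounds exactly once to double
+   precision and double rounding is avoided; the original control word
+   is restored before returning.  */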
+ENTRY(__ieee754_sqrt)
+ fldl 4(%esp)
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fstcw 4(%esp)
+ movl $0xfeff, %edx
+ andl 4(%esp), %edx
+ movl %edx, (%esp)
+ fldcw (%esp)
+ fsqrt
+ fldcw 4(%esp)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ ret
+END (__ieee754_sqrt)
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
new file mode 100644
index 0000000000..6f7e4b015f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S
@@ -0,0 +1,13 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__ieee754_sqrtf)
+ flds 4(%esp)
+ fsqrt
+ ret
+END (__ieee754_sqrtf)
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
new file mode 100644
index 0000000000..41bcd7eeb7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+#undef __ieee754_sqrtl
+long double
+__ieee754_sqrtl (long double x)
+{
+ long double res;
+
+ asm ("fsqrt" : "=t" (res) : "0" (x));
+
+ return res;
+}
+strong_alias (__ieee754_sqrtl, __sqrtl_finite)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
new file mode 100644
index 0000000000..5d8596964b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c
@@ -0,0 +1,69 @@
+/* Clear given exceptions in current floating-point environment.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feclearexcept (int excepts)
+{
+ fenv_t temp;
+
+ /* Mask out unsupported bits/exceptions. */
+ excepts &= FE_ALL_EXCEPT;
+
+ /* Bah, we have to clear selected exceptions. Since there is no
+ `fldsw' instruction we have to do it the hard way. */
+ __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+ /* Clear the relevant bits. */
+ temp.__status_word &= excepts ^ FE_ALL_EXCEPT;
+
+ /* Put the new data in effect. */
+ __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we clear the relevant MXCSR flags as
+     well.  */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xnew_exc;
+
+ /* Get the current MXCSR. */
+ __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+ /* Clear the relevant bits. */
+ xnew_exc &= ~excepts;
+
+ /* Put the new data in effect. */
+ __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+ }
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feclearexcept, __old_feclearexcept)
+compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feclearexcept, feclearexcept)
+versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
new file mode 100644
index 0000000000..f8db665425
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c
@@ -0,0 +1,54 @@
+/* Disable floating-point exceptions.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fedisableexcept (int excepts)
+{
+ unsigned short int new_exc, old_exc;
+
+ /* Get the current control word. */
+ __asm__ ("fstcw %0" : "=m" (*&new_exc));
+
+ old_exc = (~new_exc) & FE_ALL_EXCEPT;
+
+ excepts &= FE_ALL_EXCEPT;
+
+ new_exc |= excepts;
+ __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+ /* If the CPU supports SSE we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xnew_exc;
+
+ /* Get the current control word. */
+ __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
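+      /* The MXCSR exception-mask bits sit 7 bits above the
+	 corresponding flag bits, hence the shift.  */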
+ xnew_exc |= excepts << 7;
+
+ __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+ }
+
+ return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
new file mode 100644
index 0000000000..f1c42d7c27
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c
@@ -0,0 +1,54 @@
+/* Enable floating-point exceptions.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+feenableexcept (int excepts)
+{
+ unsigned short int new_exc;
+ unsigned short int old_exc;
+
+ /* Get the current control word. */
+ __asm__ ("fstcw %0" : "=m" (*&new_exc));
+
+ excepts &= FE_ALL_EXCEPT;
+ old_exc = (~new_exc) & FE_ALL_EXCEPT;
+
+ new_exc &= ~excepts;
+ __asm__ ("fldcw %0" : : "m" (*&new_exc));
+
+ /* If the CPU supports SSE we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xnew_exc;
+
+ /* Get the current control word. */
+ __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+ xnew_exc &= ~(excepts << 7);
+
+ __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+ }
+
+ return old_exc;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetenv.c b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
new file mode 100644
index 0000000000..983f6af25e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c
@@ -0,0 +1,49 @@
+/* Store current floating-point environment.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fegetenv (fenv_t *envp)
+{
+ __asm__ ("fnstenv %0" : "=m" (*envp));
+  /* And load it right back, since fnstenv masks all exceptions as a
+     side effect.  Intel intended this opcode for interrupt handlers,
+     which would want all exceptions blocked.  */
+ __asm__ ("fldenv %0" : : "m" (*envp));
+
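+  /* The i386 fenv_t has no mxcsr member, so the otherwise unused
+     __eip field is used to store it (see fenv_private.h).  */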
+ if (HAS_CPU_FEATURE (SSE))
+ __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetenv, __old_fegetenv)
+compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fegetenv)
+libm_hidden_ver (__fegetenv, fegetenv)
+versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
new file mode 100644
index 0000000000..dc87b7a470
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c
@@ -0,0 +1,31 @@
+/* Get enabled floating-point exceptions.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+
+int
+fegetexcept (void)
+{
+ unsigned short int exc;
+
+ /* Get the current control word. */
+ __asm__ ("fstcw %0" : "=m" (*&exc));
+
+ return (~exc) & FE_ALL_EXCEPT;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetmode.c b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
new file mode 100644
index 0000000000..abbce3075f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c
@@ -0,0 +1,32 @@
+/* Store current floating-point control modes. i386 version.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+fegetmode (femode_t *modep)
+{
+ _FPU_GETCW (modep->__control_word);
+ if (HAS_CPU_FEATURE (SSE))
+ __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr));
+ return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetround.c b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
new file mode 100644
index 0000000000..8ce8b859d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fegetround.c
@@ -0,0 +1,33 @@
+/* Return current rounding direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+
+int
+__fegetround (void)
+{
+ int cw;
+
+ __asm__ ("fnstcw %0" : "=m" (*&cw));
+
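+  /* The rounding-control field occupies bits 10-11 of the control
+     word; the FE_* rounding-mode macros match these bit patterns.  */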
+ return cw & 0xc00;
+}
+libm_hidden_def (__fegetround)
+weak_alias (__fegetround, fegetround)
+libm_hidden_weak (fegetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
new file mode 100644
index 0000000000..d327358913
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c
@@ -0,0 +1,50 @@
+/* Store current floating-point environment and clear exceptions.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__feholdexcept (fenv_t *envp)
+{
+ /* Store the environment. Recall that fnstenv has a side effect of
+ masking all exceptions. Then clear all exceptions. */
+ __asm__ volatile ("fnstenv %0; fnclex" : "=m" (*envp));
+
+ /* If the CPU supports SSE we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xwork;
+
+ /* Get the current control word. */
+ __asm__ ("stmxcsr %0" : "=m" (envp->__eip));
+
+ /* Set all exceptions to non-stop and clear them. */
+ xwork = (envp->__eip | 0x1f80) & ~0x3f;
+
+ __asm__ ("ldmxcsr %0" : : "m" (*&xwork));
+ }
+
+ return 0;
+}
+libm_hidden_def (__feholdexcept)
+weak_alias (__feholdexcept, feholdexcept)
+libm_hidden_weak (feholdexcept)
diff --git a/REORG.TODO/sysdeps/i386/fpu/fenv_private.h b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
new file mode 100644
index 0000000000..e20e1f1662
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h
@@ -0,0 +1,501 @@
+#ifndef FENV_PRIVATE_H
+#define FENV_PRIVATE_H 1
+
+#include <fenv.h>
+#include <fpu_control.h>
+
+#ifdef __SSE2_MATH__
+# define math_opt_barrier(x) \
+ ({ __typeof(x) __x; \
+ if (sizeof (x) <= sizeof (double)) \
+ __asm ("" : "=x" (__x) : "0" (x)); \
+ else \
+ __asm ("" : "=t" (__x) : "0" (x)); \
+ __x; })
+# define math_force_eval(x) \
+ do { \
+ if (sizeof (x) <= sizeof (double)) \
+ __asm __volatile ("" : : "x" (x)); \
+ else \
+ __asm __volatile ("" : : "f" (x)); \
+ } while (0)
+#else
+# define math_opt_barrier(x) \
+ ({ __typeof (x) __x; \
+ __asm ("" : "=t" (__x) : "0" (x)); \
+ __x; })
+# define math_force_eval(x) \
+ do { \
+ __typeof (x) __x = (x); \
+ if (sizeof (x) <= sizeof (double)) \
+ __asm __volatile ("" : : "m" (__x)); \
+ else \
+ __asm __volatile ("" : : "f" (__x)); \
+ } while (0)
+#endif
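+
+/* math_opt_barrier forces its argument through a register so the
+   compiler cannot fold or move the value across the barrier;
+   math_force_eval forces an evaluation wanted only for its side
+   effects (floating-point exceptions), even though the result is
+   unused.  */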
+
+/* This file is used by both the 32- and 64-bit ports. The 64-bit port
+ has a field in the fenv_t for the mxcsr; the 32-bit port does not.
+ Instead, we (ab)use the only 32-bit field extant in the struct. */
+#ifndef __x86_64__
+# define __mxcsr __eip
+#endif
+
+
+/* All of these functions are private to libm, and are all used in pairs
+   to save+change the fp state and restore the original state.  Thus we
+   need not care about both the 387 and the SSE unit, only the one we're
+   actually using.  */
+
+#if defined __AVX__ || defined SSE2AVX
+# define STMXCSR "vstmxcsr"
+# define LDMXCSR "vldmxcsr"
+#else
+# define STMXCSR "stmxcsr"
+# define LDMXCSR "ldmxcsr"
+#endif
+
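+/* MXCSR layout used in the masks below: exception flags in bits 0-5
+   (0x3f), DAZ in bit 6, exception masks in bits 7-12 (0x1f80),
+   rounding control in bits 13-14 (0x6000), FZ in bit 15.  The x87
+   rounding bits (10-11) shifted left by 3 give the MXCSR ones.  */
+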
+static __always_inline void
+libc_feholdexcept_sse (fenv_t *e)
+{
+ unsigned int mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ e->__mxcsr = mxcsr;
+ mxcsr = (mxcsr | 0x1f80) & ~0x3f;
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feholdexcept_387 (fenv_t *e)
+{
+ /* Recall that fnstenv has a side-effect of masking exceptions.
+ Clobber all of the fp registers so that the TOS field is 0. */
+ asm volatile ("fnstenv %0; fnclex"
+ : "=m"(*e)
+ : : "st", "st(1)", "st(2)", "st(3)",
+ "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline void
+libc_fesetround_sse (int r)
+{
+ unsigned int mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ mxcsr = (mxcsr & ~0x6000) | (r << 3);
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_fesetround_387 (int r)
+{
+ fpu_control_t cw;
+ _FPU_GETCW (cw);
+ cw = (cw & ~0xc00) | r;
+ _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_sse (fenv_t *e, int r)
+{
+ unsigned int mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ e->__mxcsr = mxcsr;
+ mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+/* Set both rounding mode and precision. A convenience function for use
+ by libc_feholdexcept_setround and libc_feholdexcept_setround_53bit. */
+static __always_inline void
+libc_feholdexcept_setround_387_prec (fenv_t *e, int r)
+{
+ libc_feholdexcept_387 (e);
+
+ fpu_control_t cw = e->__control_word;
+ cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+ cw |= r | 0x3f;
+ _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387 (fenv_t *e, int r)
+{
+ libc_feholdexcept_setround_387_prec (e, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit (fenv_t *e, int r)
+{
+ libc_feholdexcept_setround_387_prec (e, r | _FPU_DOUBLE);
+}
+
+static __always_inline int
+libc_fetestexcept_sse (int e)
+{
+ unsigned int mxcsr;
+ asm volatile (STMXCSR " %0" : "=m" (*&mxcsr));
+ return mxcsr & e & FE_ALL_EXCEPT;
+}
+
+static __always_inline int
+libc_fetestexcept_387 (int ex)
+{
+ fexcept_t temp;
+ asm volatile ("fnstsw %0" : "=a" (temp));
+ return temp & ex & FE_ALL_EXCEPT;
+}
+
+static __always_inline void
+libc_fesetenv_sse (fenv_t *e)
+{
+ asm volatile (LDMXCSR " %0" : : "m" (e->__mxcsr));
+}
+
+static __always_inline void
+libc_fesetenv_387 (fenv_t *e)
+{
+ /* Clobber all fp registers so that the TOS value we saved earlier is
+ compatible with the current state of the compiler. */
+ asm volatile ("fldenv %0"
+ : : "m" (*e)
+ : "st", "st(1)", "st(2)", "st(3)",
+ "st(4)", "st(5)", "st(6)", "st(7)");
+}
+
+static __always_inline int
+libc_feupdateenv_test_sse (fenv_t *e, int ex)
+{
+ unsigned int mxcsr, old_mxcsr, cur_ex;
+ asm volatile (STMXCSR " %0" : "=m" (*&mxcsr));
+ cur_ex = mxcsr & FE_ALL_EXCEPT;
+
+ /* Merge current exceptions with the old environment. */
+ old_mxcsr = e->__mxcsr;
+ mxcsr = old_mxcsr | cur_ex;
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+
+ /* Raise SIGFPE for any new exceptions since the hold. Expect that
+ the normal environment has all exceptions masked. */
+ if (__glibc_unlikely (~(old_mxcsr >> 7) & cur_ex))
+ __feraiseexcept (cur_ex);
+
+ /* Test for exceptions raised since the hold. */
+ return cur_ex & ex;
+}
+
+static __always_inline int
+libc_feupdateenv_test_387 (fenv_t *e, int ex)
+{
+ fexcept_t cur_ex;
+
+ /* Save current exceptions. */
+ asm volatile ("fnstsw %0" : "=a" (cur_ex));
+ cur_ex &= FE_ALL_EXCEPT;
+
+ /* Reload original environment. */
+ libc_fesetenv_387 (e);
+
+ /* Merge current exceptions. */
+ __feraiseexcept (cur_ex);
+
+ /* Test for exceptions raised since the hold. */
+ return cur_ex & ex;
+}
+
+static __always_inline void
+libc_feupdateenv_sse (fenv_t *e)
+{
+ libc_feupdateenv_test_sse (e, 0);
+}
+
+static __always_inline void
+libc_feupdateenv_387 (fenv_t *e)
+{
+ libc_feupdateenv_test_387 (e, 0);
+}
+
+static __always_inline void
+libc_feholdsetround_sse (fenv_t *e, int r)
+{
+ unsigned int mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ e->__mxcsr = mxcsr;
+ mxcsr = (mxcsr & ~0x6000) | (r << 3);
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec (fenv_t *e, int r)
+{
+ fpu_control_t cw;
+
+ _FPU_GETCW (cw);
+ e->__control_word = cw;
+ cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+ cw |= r;
+ _FPU_SETCW (cw);
+}
+
+static __always_inline void
+libc_feholdsetround_387 (fenv_t *e, int r)
+{
+ libc_feholdsetround_387_prec (e, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit (fenv_t *e, int r)
+{
+ libc_feholdsetround_387_prec (e, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feresetround_sse (fenv_t *e)
+{
+ unsigned int mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ mxcsr = (mxcsr & ~0x6000) | (e->__mxcsr & 0x6000);
+ asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
+}
+
+static __always_inline void
+libc_feresetround_387 (fenv_t *e)
+{
+ _FPU_SETCW (e->__control_word);
+}
+
+#ifdef __SSE_MATH__
+# define libc_feholdexceptf libc_feholdexcept_sse
+# define libc_fesetroundf libc_fesetround_sse
+# define libc_feholdexcept_setroundf libc_feholdexcept_setround_sse
+# define libc_fetestexceptf libc_fetestexcept_sse
+# define libc_fesetenvf libc_fesetenv_sse
+# define libc_feupdateenv_testf libc_feupdateenv_test_sse
+# define libc_feupdateenvf libc_feupdateenv_sse
+# define libc_feholdsetroundf libc_feholdsetround_sse
+# define libc_feresetroundf libc_feresetround_sse
+#else
+# define libc_feholdexceptf libc_feholdexcept_387
+# define libc_fesetroundf libc_fesetround_387
+# define libc_feholdexcept_setroundf libc_feholdexcept_setround_387
+# define libc_fetestexceptf libc_fetestexcept_387
+# define libc_fesetenvf libc_fesetenv_387
+# define libc_feupdateenv_testf libc_feupdateenv_test_387
+# define libc_feupdateenvf libc_feupdateenv_387
+# define libc_feholdsetroundf libc_feholdsetround_387
+# define libc_feresetroundf libc_feresetround_387
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__
+# define libc_feholdexcept libc_feholdexcept_sse
+# define libc_fesetround libc_fesetround_sse
+# define libc_feholdexcept_setround libc_feholdexcept_setround_sse
+# define libc_fetestexcept libc_fetestexcept_sse
+# define libc_fesetenv libc_fesetenv_sse
+# define libc_feupdateenv_test libc_feupdateenv_test_sse
+# define libc_feupdateenv libc_feupdateenv_sse
+# define libc_feholdsetround libc_feholdsetround_sse
+# define libc_feresetround libc_feresetround_sse
+#else
+# define libc_feholdexcept libc_feholdexcept_387
+# define libc_fesetround libc_fesetround_387
+# define libc_feholdexcept_setround libc_feholdexcept_setround_387
+# define libc_fetestexcept libc_fetestexcept_387
+# define libc_fesetenv libc_fesetenv_387
+# define libc_feupdateenv_test libc_feupdateenv_test_387
+# define libc_feupdateenv libc_feupdateenv_387
+# define libc_feholdsetround libc_feholdsetround_387
+# define libc_feresetround libc_feresetround_387
+#endif /* __SSE2_MATH__ */
+
+#define libc_feholdexceptl libc_feholdexcept_387
+#define libc_fesetroundl libc_fesetround_387
+#define libc_feholdexcept_setroundl libc_feholdexcept_setround_387
+#define libc_fetestexceptl libc_fetestexcept_387
+#define libc_fesetenvl libc_fesetenv_387
+#define libc_feupdateenv_testl libc_feupdateenv_test_387
+#define libc_feupdateenvl libc_feupdateenv_387
+#define libc_feholdsetroundl libc_feholdsetround_387
+#define libc_feresetroundl libc_feresetround_387
+
+#ifndef __SSE2_MATH__
+# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround_387_53bit
+# define libc_feholdsetround_53bit libc_feholdsetround_387_53bit
+#endif
+
+/* We have support for rounding mode context. */
+#define HAVE_RM_CTX 1
+
+static __always_inline void
+libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+ unsigned int mxcsr, new_mxcsr;
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
+
+ ctx->env.__mxcsr = mxcsr;
+ if (__glibc_unlikely (mxcsr != new_mxcsr))
+ {
+ asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
+ ctx->updated_status = true;
+ }
+ else
+ ctx->updated_status = false;
+}
+
+/* Unconditional since we want to overwrite any exceptions that occurred in the
+ context. This is also why all fehold* functions unconditionally write into
+ ctx->env. */
+static __always_inline void
+libc_fesetenv_sse_ctx (struct rm_ctx *ctx)
+{
+ libc_fesetenv_sse (&ctx->env);
+}
+
+static __always_inline void
+libc_feupdateenv_sse_ctx (struct rm_ctx *ctx)
+{
+ if (__glibc_unlikely (ctx->updated_status))
+ libc_feupdateenv_test_sse (&ctx->env, 0);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+ libc_feholdexcept_387 (&ctx->env);
+
+ fpu_control_t cw = ctx->env.__control_word;
+ fpu_control_t old_cw = cw;
+ cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+ cw |= r | 0x3f;
+
+ if (__glibc_unlikely (old_cw != cw))
+ {
+ _FPU_SETCW (cw);
+ ctx->updated_status = true;
+ }
+ else
+ ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r)
+{
+ libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{
+ libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r)
+{
+ fpu_control_t cw, new_cw;
+
+ _FPU_GETCW (cw);
+ new_cw = cw;
+ new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
+ new_cw |= r;
+
+ ctx->env.__control_word = cw;
+ if (__glibc_unlikely (new_cw != cw))
+ {
+ _FPU_SETCW (new_cw);
+ ctx->updated_status = true;
+ }
+ else
+ ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r)
+{
+ libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
+}
+
+static __always_inline void
+libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r)
+{
+ libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
+}
+
+static __always_inline void
+libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
+{
+ unsigned int mxcsr, new_mxcsr;
+
+ asm (STMXCSR " %0" : "=m" (*&mxcsr));
+ new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
+
+ ctx->env.__mxcsr = mxcsr;
+ if (__glibc_unlikely (new_mxcsr != mxcsr))
+ {
+ asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
+ ctx->updated_status = true;
+ }
+ else
+ ctx->updated_status = false;
+}
+
+static __always_inline void
+libc_feresetround_sse_ctx (struct rm_ctx *ctx)
+{
+ if (__glibc_unlikely (ctx->updated_status))
+ libc_feresetround_sse (&ctx->env);
+}
+
+static __always_inline void
+libc_feresetround_387_ctx (struct rm_ctx *ctx)
+{
+ if (__glibc_unlikely (ctx->updated_status))
+ _FPU_SETCW (ctx->env.__control_word);
+}
+
+static __always_inline void
+libc_feupdateenv_387_ctx (struct rm_ctx *ctx)
+{
+ if (__glibc_unlikely (ctx->updated_status))
+ libc_feupdateenv_test_387 (&ctx->env, 0);
+}
+
+#ifdef __SSE_MATH__
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenvf_ctx libc_fesetenv_sse_ctx
+# define libc_feupdateenvf_ctx libc_feupdateenv_sse_ctx
+# define libc_feholdsetroundf_ctx libc_feholdsetround_sse_ctx
+# define libc_feresetroundf_ctx libc_feresetround_sse_ctx
+#else
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenvf_ctx libc_feupdateenv_387_ctx
+# define libc_feholdsetroundf_ctx libc_feholdsetround_387_ctx
+# define libc_feresetroundf_ctx libc_feresetround_387_ctx
+#endif /* __SSE_MATH__ */
+
+#ifdef __SSE2_MATH__
+# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenv_ctx libc_fesetenv_sse_ctx
+# define libc_feupdateenv_ctx libc_feupdateenv_sse_ctx
+# define libc_feholdsetround_ctx libc_feholdsetround_sse_ctx
+# define libc_feresetround_ctx libc_feresetround_sse_ctx
+#else
+# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenv_ctx libc_feupdateenv_387_ctx
+# define libc_feholdsetround_ctx libc_feholdsetround_387_ctx
+# define libc_feresetround_ctx libc_feresetround_387_ctx
+#endif /* __SSE2_MATH__ */
+
+#define libc_feholdexcept_setroundl_ctx libc_feholdexcept_setround_387_ctx
+#define libc_feupdateenvl_ctx libc_feupdateenv_387_ctx
+#define libc_feholdsetroundl_ctx libc_feholdsetround_387_ctx
+#define libc_feresetroundl_ctx libc_feresetround_387_ctx
+
+#ifndef __SSE2_MATH__
+# define libc_feholdsetround_53bit_ctx libc_feholdsetround_387_53bit_ctx
+# define libc_feresetround_53bit_ctx libc_feresetround_387_ctx
+#endif
+
+#undef __mxcsr
+
+#endif /* FENV_PRIVATE_H */
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetenv.c b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
new file mode 100644
index 0000000000..a338e5d555
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c
@@ -0,0 +1,131 @@
+/* Install given floating-point environment.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <assert.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+/* All exceptions, including the x86-specific "denormal operand"
+ exception. */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
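+/* __FE_DENORM is bit 1 of both the x87 status word and the MXCSR; it
+   has no standard FE_* name.  */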
+
+
+int
+__fesetenv (const fenv_t *envp)
+{
+ fenv_t temp;
+
+ /* The memory block used by fstenv/fldenv has a size of 28 bytes. */
+ assert (sizeof (fenv_t) == 28);
+
+ /* Install the environment specified by ENVP. But there are a few
+ values which we do not want to come from the saved environment.
+ Therefore, we get the current environment and replace the values
+ we want to use from the environment specified by the parameter. */
+ __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+ if (envp == FE_DFL_ENV)
+ {
+ temp.__control_word |= FE_ALL_EXCEPT_X86;
+ temp.__control_word &= ~FE_TOWARDZERO;
+ temp.__control_word |= _FPU_EXTENDED;
+ temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+ }
+ else if (envp == FE_NOMASK_ENV)
+ {
+ temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO);
+ /* Keep the "denormal operand" exception masked. */
+ temp.__control_word |= __FE_DENORM;
+ temp.__control_word |= _FPU_EXTENDED;
+ temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+ }
+ else
+ {
+ temp.__control_word &= ~(FE_ALL_EXCEPT_X86
+ | FE_TOWARDZERO
+ | _FPU_EXTENDED);
+ temp.__control_word |= (envp->__control_word
+ & (FE_ALL_EXCEPT_X86
+ | FE_TOWARDZERO
+ | _FPU_EXTENDED));
+ temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+ temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86;
+ }
+ temp.__eip = 0;
+ temp.__cs_selector = 0;
+ temp.__opcode = 0;
+ temp.__data_offset = 0;
+ temp.__data_selector = 0;
+
+ __asm__ ("fldenv %0" : : "m" (temp));
+
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int mxcsr;
+ __asm__ ("stmxcsr %0" : "=m" (mxcsr));
+
+ if (envp == FE_DFL_ENV)
+ {
+ /* Clear SSE exceptions. */
+ mxcsr &= ~FE_ALL_EXCEPT_X86;
+ /* Set mask for SSE MXCSR. */
+ mxcsr |= (FE_ALL_EXCEPT_X86 << 7);
+ /* Set rounding to FE_TONEAREST. */
+ mxcsr &= ~0x6000;
+ mxcsr |= (FE_TONEAREST << 3);
+ /* Clear the FZ and DAZ bits. */
+ mxcsr &= ~0x8040;
+ }
+ else if (envp == FE_NOMASK_ENV)
+ {
+ /* Clear SSE exceptions. */
+ mxcsr &= ~FE_ALL_EXCEPT_X86;
+ /* Do not mask exceptions. */
+ mxcsr &= ~(FE_ALL_EXCEPT << 7);
+ /* Keep the "denormal operand" exception masked. */
+ mxcsr |= (__FE_DENORM << 7);
+ /* Set rounding to FE_TONEAREST. */
+ mxcsr &= ~0x6000;
+ mxcsr |= (FE_TONEAREST << 3);
+ /* Clear the FZ and DAZ bits. */
+ mxcsr &= ~0x8040;
+ }
+ else
+ mxcsr = envp->__eip;
+
+ __asm__ ("ldmxcsr %0" : : "m" (mxcsr));
+ }
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetenv, __old_fesetenv)
+compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__fesetenv)
+libm_hidden_ver (__fesetenv, fesetenv)
+versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
new file mode 100644
index 0000000000..adfcf17ba6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c
@@ -0,0 +1,31 @@
+/* Set given exception flags. i386 version.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+
+int
+fesetexcept (int excepts)
+{
+ fenv_t temp;
+
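+  /* There is no instruction to write the status word directly, so
+     round-trip the whole environment.  The new flags are set
+     immediately; unmasked ones trap at the next floating-point
+     instruction.  */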
+ __asm__ ("fnstenv %0" : "=m" (*&temp));
+ temp.__status_word |= excepts & FE_ALL_EXCEPT;
+ __asm__ ("fldenv %0" : : "m" (*&temp));
+
+ return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetmode.c b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
new file mode 100644
index 0000000000..bd9f74cd97
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c
@@ -0,0 +1,54 @@
+/* Install given floating-point control modes. i386 version.
+ Copyright (C) 2016-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+/* All exceptions, including the x86-specific "denormal operand"
+ exception. */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+int
+fesetmode (const femode_t *modep)
+{
+ fpu_control_t cw;
+ if (modep == FE_DFL_MODE)
+ cw = _FPU_DEFAULT;
+ else
+ cw = modep->__control_word;
+ _FPU_SETCW (cw);
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int mxcsr;
+ __asm__ ("stmxcsr %0" : "=m" (mxcsr));
+ /* Preserve SSE exception flags but restore other state in
+ MXCSR. */
+ mxcsr &= FE_ALL_EXCEPT_X86;
+ if (modep == FE_DFL_MODE)
+ /* Default MXCSR state has all bits zero except for those
+ masking exceptions. */
+ mxcsr |= FE_ALL_EXCEPT_X86 << 7;
+ else
+ mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
+ __asm__ ("ldmxcsr %0" : : "m" (mxcsr));
+ }
+ return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetround.c b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
new file mode 100644
index 0000000000..a3fa6235c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fesetround.c
@@ -0,0 +1,54 @@
+/* Set current rounding direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetround (int round)
+{
+ unsigned short int cw;
+
+ if ((round & ~0xc00) != 0)
+    /* ROUND is not a valid rounding mode.  */
+ return 1;
+
+ __asm__ ("fnstcw %0" : "=m" (*&cw));
+ cw &= ~0xc00;
+ cw |= round;
+ __asm__ ("fldcw %0" : : "m" (*&cw));
+
+ /* If the CPU supports SSE we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xcw;
+
+ __asm__ ("stmxcsr %0" : "=m" (*&xcw));
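+      /* MXCSR keeps rounding control in bits 13-14, three bits above
+	 its x87 control-word position.  */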
+ xcw &= ~0x6000;
+ xcw |= round << 3;
+ __asm__ ("ldmxcsr %0" : : "m" (*&xcw));
+ }
+
+ return 0;
+}
+libm_hidden_def (__fesetround)
+weak_alias (__fesetround, fesetround)
+libm_hidden_weak (fesetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
new file mode 100644
index 0000000000..b610289cd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
@@ -0,0 +1,60 @@
+/* Install given floating-point environment and raise exceptions.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+ fexcept_t temp;
+ unsigned int xtemp = 0;
+
+ /* Save current exceptions. */
+ __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+ /* If the CPU supports SSE we test the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+ temp = (temp | xtemp) & FE_ALL_EXCEPT;
+
+ /* Install new environment. */
+ __fesetenv (envp);
+
+  /* Raise the saved exceptions.  Incidentally, for us the
+     implementation-defined format of the values in objects of type
+     fexcept_t is the same as that specified by the FE_* constants.  */
+ __feraiseexcept ((int) temp);
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feupdateenv)
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
new file mode 100644
index 0000000000..954e5f69d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
@@ -0,0 +1,57 @@
+/* Store current representation for exceptions.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+ fexcept_t temp;
+
+ /* Get the current exceptions. */
+ __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+ *flagp = temp & excepts & FE_ALL_EXCEPT;
+
+  /* If the CPU supports SSE, we read the MXCSR as well.  */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int sse_exc;
+
+ /* Get the current MXCSR. */
+ __asm__ ("stmxcsr %0" : "=m" (*&sse_exc));
+
+ *flagp |= sse_exc & excepts & FE_ALL_EXCEPT;
+ }
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
new file mode 100644
index 0000000000..913d7b912c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
@@ -0,0 +1,124 @@
+/* Raise given exceptions.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <math.h>
+
+int
+__feraiseexcept (int excepts)
+{
+  /* Raise exceptions represented by EXCEPTS.  But we must raise only
+     one signal at a time.  It is important that if the overflow/underflow
+     exception and the divide by zero exception are given at the same
+     time, the overflow/underflow exception follows the divide by zero
+     exception.  */
+
+ /* First: invalid exception. */
+ if ((FE_INVALID & excepts) != 0)
+ {
+ /* One example of an invalid operation is 0.0 / 0.0. */
+ double d;
+ __asm__ __volatile__ ("fldz; fdiv %%st, %%st(0); fwait" : "=t" (d));
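+      /* Referencing D keeps the compiler from warning about (or
+	 discarding) the otherwise unused variable.  */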
+ (void) &d;
+ }
+
+ /* Next: division by zero. */
+ if ((FE_DIVBYZERO & excepts) != 0)
+ {
+ double d;
+ __asm__ __volatile__ ("fldz; fld1; fdivp %%st, %%st(1); fwait"
+ : "=t" (d));
+ (void) &d;
+ }
+
+ /* Next: overflow. */
+ if ((FE_OVERFLOW & excepts) != 0)
+ {
+ /* There is no way to raise only the overflow flag. Do it the
+ hard way. */
+ fenv_t temp;
+
+      /* Bah, we have to set selected exceptions.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+ __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+ /* Set the relevant bits. */
+ temp.__status_word |= FE_OVERFLOW;
+
+ /* Put the new data in effect. */
+ __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+ /* And raise the exception. */
+ __asm__ __volatile__ ("fwait");
+ }
+
+ /* Next: underflow. */
+ if ((FE_UNDERFLOW & excepts) != 0)
+ {
+ /* There is no way to raise only the underflow flag. Do it the
+ hard way. */
+ fenv_t temp;
+
+      /* Bah, we have to set selected exceptions.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+ __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+ /* Set the relevant bits. */
+ temp.__status_word |= FE_UNDERFLOW;
+
+ /* Put the new data in effect. */
+ __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+ /* And raise the exception. */
+ __asm__ __volatile__ ("fwait");
+ }
+
+ /* Last: inexact. */
+ if ((FE_INEXACT & excepts) != 0)
+ {
+ /* There is no way to raise only the inexact flag. Do it the
+ hard way. */
+ fenv_t temp;
+
+      /* Bah, we have to set selected exceptions.  Since there is no
+	 `fldsw' instruction we have to do it the hard way.  */
+ __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+ /* Set the relevant bits. */
+ temp.__status_word |= FE_INEXACT;
+
+ /* Put the new data in effect. */
+ __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+ /* And raise the exception. */
+ __asm__ __volatile__ ("fwait");
+ }
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feraiseexcept)
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
new file mode 100644
index 0000000000..efa64aaefd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
@@ -0,0 +1,69 @@
+/* Set floating-point environment exception handling.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <math.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+ fenv_t temp;
+
+ /* Get the current environment. We have to do this since we cannot
+ separately set the status word. */
+ __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+ temp.__status_word &= ~(excepts & FE_ALL_EXCEPT);
+ temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT;
+
+  /* Store the new status word (along with the rest of the environment).
+     Possibly new exception flags are set, but they won't be acted upon
+     until the next floating-point instruction.  */
+ __asm__ ("fldenv %0" : : "m" (*&temp));
+
+ /* If the CPU supports SSE, we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xnew_exc;
+
+ /* Get the current MXCSR. */
+ __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+ /* Set the relevant bits. */
+ xnew_exc &= ~(excepts & FE_ALL_EXCEPT);
+ xnew_exc |= *flagp & excepts & FE_ALL_EXCEPT;
+
+ /* Put the new data in effect. */
+ __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+ }
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
new file mode 100644
index 0000000000..f523f9e709
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
@@ -0,0 +1,40 @@
+/* Test exception in current environment.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+fetestexcept (int excepts)
+{
+ short temp;
+ int xtemp = 0;
+
+ /* Get current exceptions. */
+ __asm__ ("fnstsw %0" : "=a" (temp));
+
+ /* If the CPU supports SSE we test the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+ return (temp | xtemp) & excepts & FE_ALL_EXCEPT;
+}
+libm_hidden_def (fetestexcept)
diff --git a/REORG.TODO/sysdeps/i386/fpu/halfulp.c b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/halfulp.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
new file mode 100644
index 0000000000..6ffc8e6f64
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h
@@ -0,0 +1,340 @@
+/* Helper macros for x86 libm functions.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _I386_MATH_ASM_H
+#define _I386_MATH_ASM_H 1
+
+/* Remove excess range and precision by storing a value on the stack
+ and loading it back. */
+#define FLT_NARROW_EVAL \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fstps (%esp); \
+ flds (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fstpl (%esp); \
+ fldl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8);
+
+/* Define constants for the minimum value of a floating-point
+ type. */
+#define DEFINE_FLT_MIN \
+ .section .rodata.cst4,"aM",@progbits,4; \
+ .p2align 2; \
+ .type flt_min,@object; \
+flt_min: \
+ .byte 0, 0, 0x80, 0; \
+ .size flt_min, .-flt_min;
+#define DEFINE_DBL_MIN \
+ .section .rodata.cst8,"aM",@progbits,8; \
+ .p2align 3; \
+ .type dbl_min,@object; \
+dbl_min: \
+ .byte 0, 0, 0, 0, 0, 0, 0x10, 0; \
+ .size dbl_min, .-dbl_min;
+#define DEFINE_LDBL_MIN \
+ .section .rodata.cst16,"aM",@progbits,16; \
+ .p2align 4; \
+ .type ldbl_min,@object; \
+ldbl_min: \
+ .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0; \
+ .byte 0, 0, 0, 0, 0, 0; \
+ .size ldbl_min, .-ldbl_min;
+
+/* Remove excess range and precision by storing a value on the stack
+ and loading it back. The value is given to be nonnegative or NaN;
+ if it is subnormal, also force an underflow exception. The
+ relevant constant for the minimum of the type must have been
+ defined, the MO macro must have been defined for access to memory
+ operands, and, if PIC, the PIC register must have been loaded. */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG_NAN \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ flds MO(flt_min); \
+ fld %st(1); \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+6424: fstps (%esp); \
+ flds (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG_NAN \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fldl MO(dbl_min); \
+ fld %st(1); \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+6453: fstpl (%esp); \
+ fldl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the argument is not a NaN (so fcom instructions,
+ which support memory operands, can be used). */
+#define FLT_NARROW_EVAL_UFLOW_NONNEG \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fcoms MO(flt_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+6424: fstps (%esp); \
+ flds (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNEG \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fcoml MO(dbl_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+6453: fstpl (%esp); \
+ fldl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8);
+
+/* Likewise, but the non-NaN argument may be negative. */
+#define FLT_NARROW_EVAL_UFLOW_NONNAN \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fld %st(0); \
+ fabs; \
+ fcomps MO(flt_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+6424: fstps (%esp); \
+ flds (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4);
+#define DBL_NARROW_EVAL_UFLOW_NONNAN \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fld %st(0); \
+ fabs; \
+ fcompl MO(dbl_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+6453: fstpl (%esp); \
+ fldl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8);
+
+/* Force an underflow exception if the given value is subnormal. The
+ relevant constant for the minimum of the type must have been
+ defined, the MO macro must have been defined for access to memory
+ operands, and, if PIC, the PIC register must have been loaded. */
+#define FLT_CHECK_FORCE_UFLOW \
+ flds MO(flt_min); \
+ fld %st(1); \
+ fabs; \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4); \
+6424:
+#define DBL_CHECK_FORCE_UFLOW \
+ fldl MO(dbl_min); \
+ fld %st(1); \
+ fabs; \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8); \
+6453:
+
+/* Likewise, but also remove excess range and precision if the value
+ is subnormal. */
+#define FLT_CHECK_FORCE_UFLOW_NARROW \
+ flds MO(flt_min); \
+ fld %st(1); \
+ fabs; \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+ fstps (%esp); \
+ flds (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4); \
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NARROW \
+ fldl MO(dbl_min); \
+ fld %st(1); \
+ fabs; \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+ fstpl (%esp); \
+ fldl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8); \
+6453:
+
+/* Likewise, but the argument is nonnegative or NaN. */
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN \
+ fldt MO(ldbl_min); \
+ fld %st(1); \
+ fucompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6464f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstp %st(0); \
+6464:
+
+/* Likewise, but the argument is not a NaN. */
+#define FLT_CHECK_FORCE_UFLOW_NONNAN \
+ fld %st(0); \
+ fabs; \
+ fcomps MO(flt_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4); \
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNAN \
+ fld %st(0); \
+ fabs; \
+ fcompl MO(dbl_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8); \
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNAN \
+ fldt MO(ldbl_min); \
+ fld %st(1); \
+ fabs; \
+ fcompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6464f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstp %st(0); \
+6464:
+
+/* Likewise, but the argument is nonnegative and not a NaN. */
+#define FLT_CHECK_FORCE_UFLOW_NONNEG \
+ fcoms MO(flt_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6424f; \
+ subl $4, %esp; \
+ cfi_adjust_cfa_offset (4); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstps (%esp); \
+ addl $4, %esp; \
+ cfi_adjust_cfa_offset (-4); \
+6424:
+#define DBL_CHECK_FORCE_UFLOW_NONNEG \
+ fcoml MO(dbl_min); \
+ fnstsw; \
+ sahf; \
+ jnc 6453f; \
+ subl $8, %esp; \
+ cfi_adjust_cfa_offset (8); \
+ fld %st(0); \
+ fmul %st(0); \
+ fstpl (%esp); \
+ addl $8, %esp; \
+ cfi_adjust_cfa_offset (-8); \
+6453:
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG \
+ fldt MO(ldbl_min); \
+ fld %st(1); \
+ fcompp; \
+ fnstsw; \
+ sahf; \
+ jnc 6464f; \
+ fld %st(0); \
+ fmul %st(0); \
+ fstp %st(0); \
+6464:
+
+#endif /* i386-math-asm.h. */
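
Both macro families above have simple C analogues: the *_NARROW_EVAL macros
are the classic store/reload through memory that drops x87 excess range and
precision, and the *_CHECK_FORCE_UFLOW macros square a subnormal value so the
discarded product raises the underflow flag. A rough sketch with hypothetical
helper names (not part of this header):

#include <float.h>
#include <math.h>

/* Analogue of DBL_NARROW_EVAL: forcing the value through a double
   in memory discards x87 excess range and precision.  */
static double
dbl_narrow_eval (double x)
{
  volatile double v = x;
  return v;
}

/* Analogue of DBL_CHECK_FORCE_UFLOW: if X is subnormal, compute a
   discarded tiny*tiny product so the underflow flag is raised.  */
static double
dbl_check_force_uflow (double x)
{
  if (fabs (x) < DBL_MIN)
    {
      volatile double force = x * x;    /* underflows unless x == 0 */
      (void) force;
    }
  return x;
}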
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
new file mode 100644
index 0000000000..0fc50907ad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps
@@ -0,0 +1,2202 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+float: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+float: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csinh":
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 2
+idouble: 4
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+float: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
new file mode 100644
index 0000000000..54ca0d8295
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name
@@ -0,0 +1 @@
+ix86
diff --git a/REORG.TODO/sysdeps/i386/fpu/math-tests.h b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
new file mode 100644
index 0000000000..26d0633dc0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math-tests.h
@@ -0,0 +1,27 @@
+/* Configuration for math tests. 32-bit x86 version.
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* On 32-bit x86, versions of GCC up to at least 4.8 are happy to use FPU load
+ instructions for sNaN values, and loading a float or double sNaN value will
+ already raise an INVALID exception as well as turn the sNaN into a qNaN,
+ rendering certain tests infeasible in this scenario.
+ <http://gcc.gnu.org/PR56831>. */
+#define SNAN_TESTS_float 0
+#define SNAN_TESTS_double 0
+
+#include_next <math-tests.h>
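
The cited PR is about GCC compiling a plain sNaN access into an x87 fld,
which both raises INVALID and quiets the NaN before any test body runs. A
standalone sketch of the effect, assuming GCC's __builtin_nansf and an x87
(non-SSE) float ABI:

#include <fenv.h>
#include <stdio.h>

int
main (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  volatile float snan = __builtin_nansf ("");   /* signaling NaN */
  volatile float copy = snan;   /* the x87 fld/fstp pair here may
                                   already raise FE_INVALID and
                                   quiet the NaN */
  (void) copy;
  if (fetestexcept (FE_INVALID))
    puts ("INVALID raised by a plain copy; sNaN tests can't work");
  return 0;
}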
diff --git a/REORG.TODO/sysdeps/i386/fpu/math_private.h b/REORG.TODO/sysdeps/i386/fpu/math_private.h
new file mode 100644
index 0000000000..485214391f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/math_private.h
@@ -0,0 +1,7 @@
+#ifndef I386_MATH_PRIVATE_H
+#define I386_MATH_PRIVATE_H 1
+
+#include "fenv_private.h"
+#include_next <math_private.h>
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan.c b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan2.c b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpexp.c b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpexp.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mplog.c b/REORG.TODO/sysdeps/i386/fpu/mplog.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mplog.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinh.S b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
new file mode 100644
index 0000000000..1a60f7de2c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type huge,@object
+huge: .double 1e+300
+ ASM_SIZE_DIRECTIVE(huge)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__asinh)
+ movl 8(%esp), %ecx
+ movl $0x7fffffff, %eax
+ andl %ecx, %eax
+ andl $0x80000000, %ecx
+ movl %eax, %edx
+ orl $0x800fffff, %edx
+ incl %edx
+ jz 7f // x is ±Inf or NaN
+ xorl %ecx, 8(%esp)
+ fldl 4(%esp) // |x|
+ cmpl $0x3e300000, %eax
+ jb 2f // |x| < 2^-28
+ fldln2 // log(2) : |x|
+ cmpl $0x41b00000, %eax
+ fxch // |x| : log(2)
+ ja 3f // |x| > 2^28
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x40000000, %eax
+ ja 5f // |x| > 2
+
+ // 2^-28 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+ fld %st // |x| : |x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : log(2)
+ fld %st // |x|^2 : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+ faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 6f
+ fyl2xp1
+ jecxz 4f
+ fchs
+4: ret
+
+7: fldl 4(%esp)
+ ret
+
+6: faddl MO(one)
+ fyl2x
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| < 2^-28 => y = x (inexact iff |x| != 0.0)
+ .align ALIGNARG(4)
+2:
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ jecxz 4f
+ fchs // x
+4: fld %st // x : x
+ faddl MO(huge) // huge+x : x
+ fstp %st(0) // x
+ cmpl $0x00100000, %eax
+ jae 8f
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fld %st(0)
+ fmul %st(0)
+ fstpl (%esp)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+8: ret
+
+ // |x| > 2^28 => y = sign(x) * (log(|x|) + log(2))
+ .align ALIGNARG(4)
+3: fyl2x // log(|x|)
+ fldln2 // log(2) : log(|x|)
+ faddp // log(|x|)+log(2)
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+ .align ALIGNARG(4)
+5: fld %st // |x| : |x| : log(2)
+ fadd %st, %st(1) // |x| : 2*|x| : log(2)
+ fld %st // |x| : |x| : 2*|x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+ faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+ fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+ faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+ fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+ jecxz 4f
+ fchs
+4: ret
+END(__asinh)
+weak_alias (__asinh, asinh)
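
The branches above implement the usual four-case asinh evaluation; restated
as a C sketch (thresholds follow the asm comments; this is the math, not the
shipped code):

#include <math.h>

static double
asinh_sketch (double x)
{
  double ax = fabs (x);
  double y;

  if (ax < 0x1p-28)       /* |x| < 2^-28: asinh(x) ~= x            */
    y = ax;
  else if (ax > 0x1p28)   /* |x| > 2^28: log|x| + log 2            */
    y = log (ax) + M_LN2;
  else if (ax > 2.0)      /* log(2|x| + 1/(|x| + sqrt(x^2 + 1)))   */
    y = log (2.0 * ax + 1.0 / (ax + sqrt (ax * ax + 1.0)));
  else                    /* log1p(|x| + x^2/(1 + sqrt(1 + x^2)))  */
    y = log1p (ax + ax * ax / (1.0 + sqrt (1.0 + ax * ax)));

  return copysign (y, x);
}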
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
new file mode 100644
index 0000000000..12bcfef934
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S
@@ -0,0 +1,139 @@
+/* ix87 specific implementation of arcsinh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type huge,@object
+huge: .double 1e+36
+ ASM_SIZE_DIRECTIVE(huge)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__asinhf)
+ movl 4(%esp), %ecx
+ movl $0x7fffffff, %eax
+ andl %ecx, %eax
+ andl $0x80000000, %ecx
+ movl %eax, %edx
+ orl $0x807fffff, %edx
+ incl %edx
+ jz 7f // x is ±Inf or NaN
+ xorl %ecx, 4(%esp)
+ flds 4(%esp) // |x|
+ cmpl $0x38000000, %eax
+ jb 2f // |x| < 2^-14
+ fldln2 // log(2) : |x|
+ cmpl $0x47000000, %eax
+ fxch // |x| : log(2)
+ ja 3f // |x| > 2^14
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x40000000, %eax
+ ja 5f // |x| > 2
+
+ // 2^-14 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+ fld %st // |x| : |x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : log(2)
+ fld %st // |x|^2 : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+ faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 6f
+ fyl2xp1
+ jecxz 4f
+ fchs
+4: ret
+
+7: flds 4(%esp)
+ ret
+
+6: faddl MO(one)
+ fyl2x
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| < 2^-14 => y = x (inexact iff |x| != 0.0)
+ .align ALIGNARG(4)
+2:
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ jecxz 4f
+ fchs // x
+4: fld %st // x : x
+ faddl MO(huge) // huge+x : x
+ fstp %st(0) // x
+ cmpl $0x00800000, %eax
+ jae 8f
+ subl $4, %esp
+ cfi_adjust_cfa_offset (4)
+ fld %st(0)
+ fmul %st(0)
+ fstps (%esp)
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+8: ret
+
+ // |x| > 2^14 => y = sign(x) * (log(|x|) + log(2))
+ .align ALIGNARG(4)
+3: fyl2x // log(|x|)
+ fldln2 // log(2) : log(|x|)
+ faddp // log(|x|)+log(2)
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+ .align ALIGNARG(4)
+5: fld %st // |x| : |x| : log(2)
+ fadd %st, %st(1) // |x| : 2*|x| : log(2)
+ fld %st // |x| : |x| : 2*|x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+ faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+ fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+ faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+ fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+ jecxz 4f
+ fchs
+4: ret
+END(__asinhf)
+weak_alias (__asinhf, asinhf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
new file mode 100644
index 0000000000..f31a267e78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S
@@ -0,0 +1,144 @@
+/* ix87 specific implementation of arcsinh.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type huge,@object
+huge: .tfloat 1e+4930
+ ASM_SIZE_DIRECTIVE(huge)
+ .align ALIGNARG(4)
+ /* Please note that we use a double value for 1.0. This number
+ has an exact representation, so we don't get accuracy
+ problems, and the code is simpler. */
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant be precise. It only
+ needs to be a value known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__asinhl)
+ movl 12(%esp), %ecx
+ movl $0x7fff, %eax
+ andl %ecx, %eax
+ andl $0x8000, %ecx
+ movl %eax, %edx
+ orl $0xffff8000, %edx
+ incl %edx
+ jz 7f // x is ±Inf or NaN
+ xorl %ecx, 12(%esp)
+ fldt 4(%esp) // |x|
+ cmpl $0x3fde, %eax
+ jb 2f // |x| < 2^-34
+ fldln2 // log(2) : |x|
+ cmpl $0x4020, %eax
+ fxch // |x| : log(2)
+ ja 3f // |x| > 2^34
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpl $0x4000, %eax
+ ja 5f // |x| > 2
+
+ // 2^-34 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2)))
+ fld %st // |x| : |x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : log(2)
+ fld %st // |x|^2 : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2)
+ fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2)
+ faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2)
+ fcoml MO(limit)
+ fnstsw
+ sahf
+ ja 6f
+ fyl2xp1
+ jecxz 4f
+ fchs
+4: ret
+
+7: fldt 4(%esp)
+ fadd %st
+ ret
+
+6: faddl MO(one)
+ fyl2x
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| < 2^-34 => y = x (inexact iff |x| != 0.0)
+ .align ALIGNARG(4)
+2:
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ jecxz 4f
+ fchs // x
+4: fld %st // x : x
+ fldt MO(huge) // huge : x : x
+ faddp // huge+x : x
+ fstp %st(0) // x
+ cmpl $0x0001, %eax
+ jae 8f
+ fld %st(0)
+ fmul %st(0)
+ fstp %st(0)
+8: ret
+
+ // |x| > 2^34 => y = sign(x) * (log(|x|) + log(2))
+ .align ALIGNARG(4)
+3: fyl2x // log(|x|)
+ fldln2 // log(2) : log(|x|)
+ faddp // log(|x|)+log(2)
+ jecxz 4f
+ fchs
+4: ret
+
+ // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1)))
+ .align ALIGNARG(4)
+5: fld %st // |x| : |x| : log(2)
+ fadd %st, %st(1) // |x| : 2*|x| : log(2)
+ fld %st // |x| : |x| : 2*|x| : log(2)
+ fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2)
+ faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2)
+ fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2)
+ faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2)
+ fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2)
+ faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2)
+ fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2)))
+ jecxz 4f
+ fchs
+4: ret
+END(__asinhl)
+weak_alias (__asinhl, asinhl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atan.S b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
new file mode 100644
index 0000000000..644de78feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atan.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atan.S,v 1.4 1995/05/08 23:50:41 jtc Exp $")
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__atan)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+ fld1
+ fpatan
+ DBL_CHECK_FORCE_UFLOW
+ ret
+END (__atan)
+weak_alias (__atan, atan)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanf.S b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
new file mode 100644
index 0000000000..0589c1135e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_atanf.S,v 1.3 1995/05/08 23:51:33 jtc Exp $")
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__atanf)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+ fld1
+ fpatan
+ FLT_CHECK_FORCE_UFLOW
+ ret
+END (__atanf)
+weak_alias (__atanf, atanf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanl.c b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
new file mode 100644
index 0000000000..b7dba88aad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c
@@ -0,0 +1,22 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <math_private.h>
+
+long double
+__atanl (long double x)
+{
+ long double res;
+
+ asm ("fld1\n"
+ "fpatan"
+ : "=t" (res) : "0" (x));
+
+ return res;
+}
+
+weak_alias (__atanl, atanl)
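
fpatan computes atan(st(1)/st(0)) and pops, so __atanl loads 1.0 on top to
get atan(x/1). Binding a second operand with the "u" (st(1)) constraint
yields atan2 the same way; a sketch of that idiom, in the style of glibc's
companion e_atan2l.c (hypothetical function name):

#include <math_private.h>

long double
atan2l_sketch (long double y, long double x)
{
  long double res;

  /* st(0) = x, st(1) = y; fpatan leaves atan(y/x) in st(0).  */
  asm ("fpatan" : "=t" (res) : "u" (y), "0" (x) : "st(1)");

  return res;
}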
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
new file mode 100644
index 0000000000..7f01659eae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S
@@ -0,0 +1,200 @@
+/* Compute cubic root of double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+ Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type f7,@object
+f7: .double -0.145263899385486377
+ ASM_SIZE_DIRECTIVE(f7)
+ .type f6,@object
+f6: .double 0.784932344976639262
+ ASM_SIZE_DIRECTIVE(f6)
+ .type f5,@object
+f5: .double -1.83469277483613086
+ ASM_SIZE_DIRECTIVE(f5)
+ .type f4,@object
+f4: .double 2.44693122563534430
+ ASM_SIZE_DIRECTIVE(f4)
+ .type f3,@object
+f3: .double -2.11499494167371287
+ ASM_SIZE_DIRECTIVE(f3)
+ .type f2,@object
+f2: .double 1.50819193781584896
+ ASM_SIZE_DIRECTIVE(f2)
+ .type f1,@object
+f1: .double 0.354895765043919860
+ ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2 1.2599210498948731648
+#define ONE_CBRT2 0.793700525984099737355196796584
+#define SQR_CBRT2 1.5874010519681994748
+#define ONE_SQR_CBRT2 0.629960524947436582364439673883
+
+ .type factor,@object
+factor: .double ONE_SQR_CBRT2
+ .double ONE_CBRT2
+ .double 1.0
+ .double CBRT2
+ .double SQR_CBRT2
+ ASM_SIZE_DIRECTIVE(factor)
+
+ .type two54,@object
+two54: .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
+ ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+ .text
+ENTRY(__cbrt)
+ movl 4(%esp), %ecx
+ movl 8(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+ orl %eax, %ecx
+ jz 1f
+ xorl %ecx, %ecx
+ cmpl $0x7ff00000, %eax
+ jae 1f
+
+#ifdef PIC
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ LOAD_PIC_REG (bx)
+#endif
+
+ cmpl $0x00100000, %eax
+ jae 2f
+
+#ifdef PIC
+ fldl 8(%esp)
+#else
+ fldl 4(%esp)
+#endif
+ fmull MO(two54)
+ movl $-54, %ecx
+#ifdef PIC
+ fstpl 8(%esp)
+ movl 12(%esp), %eax
+#else
+ fstpl 4(%esp)
+ movl 8(%esp), %eax
+#endif
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+
+2: shrl $20, %eax
+ andl $0x800fffff, %edx
+ subl $1022, %eax
+ orl $0x3fe00000, %edx
+ addl %eax, %ecx
+#ifdef PIC
+ movl %edx, 12(%esp)
+
+ fldl 8(%esp) /* xm */
+#else
+ movl %edx, 8(%esp)
+
+ fldl 4(%esp) /* xm */
+#endif
+ fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The second track computes the values for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Efficiency: some FP instructions
+	   can overlap with a certain number of integer (and FP)
+	   instructions, so we get (except for the imull) all the
+	   integer instructions for free.  */
+
+ fld %st(0) /* xm : xm */
+
+ fmull MO(f7) /* f7*xm : xm */
+ movl $1431655766, %eax
+ faddl MO(f6) /* f6+f7*xm : xm */
+ imull %ecx
+ fmul %st(1) /* (f6+f7*xm)*xm : xm */
+ movl %ecx, %eax
+ faddl MO(f5) /* f5+(f6+f7*xm)*xm : xm */
+ sarl $31, %eax
+ fmul %st(1) /* (f5+(f6+f7*xm)*xm)*xm : xm */
+ subl %eax, %edx
+ faddl MO(f4) /* f4+(f5+(f6+f7*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+ faddl MO(f3) /* f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+ faddl MO(f2) /* f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+ faddl MO(f1) /* u:=f1+(f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+ fld %st /* u : u : xm */
+ fmul %st(1) /* u*u : u : xm */
+ fld %st(2) /* xm : u*u : u : xm */
+ fadd %st /* 2*xm : u*u : u : xm */
+ fxch %st(1) /* u*u : 2*xm : u : xm */
+ fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */
+ movl %edx, %eax
+ fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */
+ leal (%edx,%edx,2),%edx
+ fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */
+ subl %edx, %ecx
+ faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */
+ shll $3, %ecx
+ fmulp /* u*(t2+2*xm) : 2*t2+xm */
+ fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */
+ fmull MOX(16+factor,%ecx) /* u*(t2+2*xm)/(2*t2+xm)*FACT */
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+ fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+ fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+ movl 12(%esp), %eax
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+#else
+ movl 8(%esp), %eax
+#endif
+ testl %eax, %eax
+ fstp %st(1)
+ jns 4f
+ fchs
+4: ret
+
+ /* Return the argument. */
+1: fldl 4(%esp)
+ ret
+END(__cbrt)
+weak_alias (__cbrt, cbrt)
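
For reference, the computation interleaved above can be sketched in C roughly as follows. This is a hedged reconstruction under the same constants, not the library code: frexp/ldexp stand in for the explicit word surgery (including the 2^54 pre-scaling of subnormals), and `cbrt_sketch` is an invented name.

#include <math.h>
#include <stdio.h>

static const double f[7] = /* the f1..f7 constants from the table above */
  { 0.354895765043919860, 1.50819193781584896, -2.11499494167371287,
    2.44693122563534430, -1.83469277483613086, 0.784932344976639262,
    -0.145263899385486377 };
static const double factor[5] = /* 2^(-2/3) .. 2^(2/3) */
  { 0.629960524947436582364439673883, 0.793700525984099737355196796584,
    1.0, 1.2599210498948731648, 1.5874010519681994748 };

static double
cbrt_sketch (double x)
{
  if (x == 0.0 || !isfinite (x))
    return x;                        /* the early 'jz 1f / jae 1f' exits */
  int xe;
  /* x = xm * 2^xe with 0.5 <= xm < 1, as the andl/orl splice builds.  */
  double xm = frexp (fabs (x), &xe);
  /* Polynomial initial guess u ~= cbrt(xm): the f1..f7 Horner chain.  */
  double u = ((((((f[6] * xm + f[5]) * xm + f[4]) * xm + f[3]) * xm
                + f[2]) * xm + f[1]) * xm) + f[0];
  /* One Halley iteration: u *= (u^3 + 2*xm) / (2*u^3 + xm).  */
  double t2 = u * u * u;
  u *= (t2 + 2.0 * xm) / (2.0 * t2 + xm);
  /* The integer track: xe/3 and xe%3, which the assembly gets without
     a division via imull $1431655766 (about 2^32/3).  */
  int q = xe / 3, r = xe - 3 * q;    /* r in -2..2, hence factor[r + 2] */
  return copysign (ldexp (u * factor[r + 2], q), x);
}

int
main (void)
{
  for (double x = -27.0; x <= 27.0; x += 13.5)
    printf ("cbrt(%g) = %.17g (libm: %.17g)\n", x, cbrt_sketch (x), cbrt (x));
  return 0;
}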
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
new file mode 100644
index 0000000000..645d24372d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S
@@ -0,0 +1,177 @@
+/* Compute cubic root of float value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+ Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type f3,@object
+f3: .double 0.191502161678719066
+ ASM_SIZE_DIRECTIVE(f3)
+ .type f2,@object
+f2: .double 0.697570460207922770
+ ASM_SIZE_DIRECTIVE(f2)
+ .type f1,@object
+f1: .double 0.492659620528969547
+ ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2 1.2599210498948731648
+#define ONE_CBRT2 0.793700525984099737355196796584
+#define SQR_CBRT2 1.5874010519681994748
+#define ONE_SQR_CBRT2 0.629960524947436582364439673883
+
+ .type factor,@object
+ .align ALIGNARG(4)
+factor: .double ONE_SQR_CBRT2
+ .double ONE_CBRT2
+ .double 1.0
+ .double CBRT2
+ .double SQR_CBRT2
+ ASM_SIZE_DIRECTIVE(factor)
+
+ .type two25,@object
+two25: .byte 0, 0, 0, 0x4c
+ ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+ .text
+ENTRY(__cbrtf)
+ movl 4(%esp), %eax
+ xorl %ecx, %ecx
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+ jz 1f
+ cmpl $0x7f800000, %eax
+ jae 1f
+
+#ifdef PIC
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ LOAD_PIC_REG (bx)
+#endif
+
+ cmpl $0x00800000, %eax
+ jae 2f
+
+#ifdef PIC
+ flds 8(%esp)
+#else
+ flds 4(%esp)
+#endif
+ fmuls MO(two25)
+ movl $-25, %ecx
+#ifdef PIC
+ fstps 8(%esp)
+ movl 8(%esp), %eax
+#else
+ fstps 4(%esp)
+ movl 4(%esp), %eax
+#endif
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+
+2: shrl $23, %eax
+ andl $0x807fffff, %edx
+ subl $126, %eax
+ orl $0x3f000000, %edx
+ addl %eax, %ecx
+#ifdef PIC
+ movl %edx, 8(%esp)
+
+ flds 8(%esp) /* xm */
+#else
+ movl %edx, 4(%esp)
+
+ flds 4(%esp) /* xm */
+#endif
+ fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The second track computes the values for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Efficiency: some FP instructions
+	   can overlap with a certain number of integer (and FP)
+	   instructions, so we get (except for the imull) all the
+	   integer instructions for free.  */
+
+ fld %st(0) /* xm : xm */
+ fmull MO(f3) /* f3*xm : xm */
+ movl $1431655766, %eax
+ fsubrl MO(f2) /* f2-f3*xm : xm */
+ imull %ecx
+ fmul %st(1) /* (f2-f3*xm)*xm : xm */
+ movl %ecx, %eax
+ faddl MO(f1) /* u:=f1+(f2-f3*xm)*xm : xm */
+ sarl $31, %eax
+ fld %st /* u : u : xm */
+ subl %eax, %edx
+ fmul %st(1) /* u*u : u : xm */
+ fld %st(2) /* xm : u*u : u : xm */
+ fadd %st /* 2*xm : u*u : u : xm */
+ fxch %st(1) /* u*u : 2*xm : u : xm */
+ fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */
+ movl %edx, %eax
+ fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */
+ leal (%edx,%edx,2),%edx
+ fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */
+ subl %edx, %ecx
+ faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */
+ shll $3, %ecx
+ fmulp /* u*(t2+2*xm) : 2*t2+xm */
+ fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */
+ fmull MOX(16+factor,%ecx) /* u*(t2+2*xm)/(2*t2+xm)*FACT */
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+ fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+ fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+ movl 8(%esp), %eax
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+#else
+ movl 4(%esp), %eax
+#endif
+ testl %eax, %eax
+ fstp %st(1)
+ jns 4f
+ fchs
+4: ret
+
+ /* Return the argument. */
+1: flds 4(%esp)
+ ret
+END(__cbrtf)
+weak_alias (__cbrtf, cbrtf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
new file mode 100644
index 0000000000..e4a72d29c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S
@@ -0,0 +1,229 @@
+/* Compute cubic root of long double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
+ Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type f8,@object
+f8: .tfloat 0.161617097923756032
+ ASM_SIZE_DIRECTIVE(f8)
+ .align ALIGNARG(4)
+ .type f7,@object
+f7: .tfloat -0.988553671195413709
+ ASM_SIZE_DIRECTIVE(f7)
+ .align ALIGNARG(4)
+ .type f6,@object
+f6: .tfloat 2.65298938441952296
+ ASM_SIZE_DIRECTIVE(f6)
+ .align ALIGNARG(4)
+ .type f5,@object
+f5: .tfloat -4.11151425200350531
+ ASM_SIZE_DIRECTIVE(f5)
+ .align ALIGNARG(4)
+ .type f4,@object
+f4: .tfloat 4.09559907378707839
+ ASM_SIZE_DIRECTIVE(f4)
+ .align ALIGNARG(4)
+ .type f3,@object
+f3: .tfloat -2.82414939754975962
+ ASM_SIZE_DIRECTIVE(f3)
+ .align ALIGNARG(4)
+ .type f2,@object
+f2: .tfloat 1.67595307700780102
+ ASM_SIZE_DIRECTIVE(f2)
+ .align ALIGNARG(4)
+ .type f1,@object
+f1: .tfloat 0.338058687610520237
+ ASM_SIZE_DIRECTIVE(f1)
+
+#define CBRT2 1.2599210498948731648
+#define ONE_CBRT2 0.793700525984099737355196796584
+#define SQR_CBRT2 1.5874010519681994748
+#define ONE_SQR_CBRT2 0.629960524947436582364439673883
+
+ /* We make the entries in the following table all 16 bytes
+ wide to avoid having to implement a multiplication by 10. */
+ .type factor,@object
+ .align ALIGNARG(4)
+factor: .tfloat ONE_SQR_CBRT2
+ .byte 0, 0, 0, 0, 0, 0
+ .tfloat ONE_CBRT2
+ .byte 0, 0, 0, 0, 0, 0
+ .tfloat 1.0
+ .byte 0, 0, 0, 0, 0, 0
+ .tfloat CBRT2
+ .byte 0, 0, 0, 0, 0, 0
+ .tfloat SQR_CBRT2
+ ASM_SIZE_DIRECTIVE(factor)
+
+ .type two64,@object
+ .align ALIGNARG(4)
+two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+ ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%ebx)
+#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
+#else
+#define MO(op) op
+#define MOX(op,x) op(x)
+#endif
+
+ .text
+ENTRY(__cbrtl)
+ movl 4(%esp), %ecx
+ movl 12(%esp), %eax
+ orl 8(%esp), %ecx
+ movl %eax, %edx
+ andl $0x7fff, %eax
+ orl %eax, %ecx
+ jz 1f
+ xorl %ecx, %ecx
+ cmpl $0x7fff, %eax
+ je 1f
+
+#ifdef PIC
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ LOAD_PIC_REG (bx)
+#endif
+
+ cmpl $0, %eax
+ jne 2f
+
+#ifdef PIC
+ fldt 8(%esp)
+#else
+ fldt 4(%esp)
+#endif
+ fmull MO(two64)
+ movl $-64, %ecx
+#ifdef PIC
+ fstpt 8(%esp)
+ movl 16(%esp), %eax
+#else
+ fstpt 4(%esp)
+ movl 12(%esp), %eax
+#endif
+ movl %eax, %edx
+ andl $0x7fff, %eax
+
+2: andl $0x8000, %edx
+ subl $16382, %eax
+ orl $0x3ffe, %edx
+ addl %eax, %ecx
+#ifdef PIC
+ movl %edx, 16(%esp)
+
+ fldt 8(%esp) /* xm */
+#else
+ movl %edx, 12(%esp)
+
+ fldt 4(%esp) /* xm */
+#endif
+ fabs
+
+	/* The following code has two tracks:
+	    a) compute the normalized cbrt value
+	    b) compute xe/3 and xe%3
+	   The second track computes the values for b), and it does so
+	   in an optimized way by avoiding division.
+
+	   But why two tracks at all?  Efficiency: some FP instructions
+	   can overlap with a certain number of integer (and FP)
+	   instructions, so we get (except for the imull) all the
+	   integer instructions for free.  */
+
+ fldt MO(f8) /* f8 : xm */
+ fmul %st(1) /* f8*xm : xm */
+
+ fldt MO(f7)
+ faddp /* f7+f8*xm : xm */
+ fmul %st(1) /* (f7+f8*xm)*xm : xm */
+ movl $1431655766, %eax
+ fldt MO(f6)
+ faddp /* f6+(f7+f8*xm)*xm : xm */
+ imull %ecx
+ fmul %st(1) /* (f6+(f7+f8*xm)*xm)*xm : xm */
+ movl %ecx, %eax
+ fldt MO(f5)
+ faddp /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
+ sarl $31, %eax
+ fmul %st(1) /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+ subl %eax, %edx
+ fldt MO(f4)
+ faddp /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+ fldt MO(f3)
+ faddp /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+ fldt MO(f2)
+ faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
+ fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+ fldt MO(f1)
+ faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
+
+ fld %st /* u : u : xm */
+ fmul %st(1) /* u*u : u : xm */
+ fld %st(2) /* xm : u*u : u : xm */
+ fadd %st /* 2*xm : u*u : u : xm */
+ fxch %st(1) /* u*u : 2*xm : u : xm */
+ fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */
+ movl %edx, %eax
+ fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */
+ leal (%edx,%edx,2),%edx
+ fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */
+ subl %edx, %ecx
+ faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */
+ shll $4, %ecx
+ fmulp /* u*(t2+2*xm) : 2*t2+xm */
+ fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */
+ fldt MOX(32+factor,%ecx)
+ fmulp /* u*(t2+2*xm)/(2*t2+xm)*FACT */
+ pushl %eax
+ cfi_adjust_cfa_offset (4)
+ fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
+ fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
+ fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+#ifdef PIC
+ movl 16(%esp), %eax
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+#else
+ movl 12(%esp), %eax
+#endif
+ testl $0x8000, %eax
+ fstp %st(1)
+ jz 4f
+ fchs
+4: ret
+
+ /* Return the argument. */
+1: fldt 4(%esp)
+ fadd %st
+ ret
+END(__cbrtl)
+weak_alias (__cbrtl, cbrtl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceil.S b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
new file mode 100644
index 0000000000..1226bb2f87
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceil.S,v 1.4 1995/05/08 23:52:13 jtc Exp $")
+
+ENTRY(__ceil)
+ fldl 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x0800,%edx /* round towards +oo */
+ orl 4(%esp),%edx
+ andl $0xfbff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__ceil)
+weak_alias (__ceil, ceil)
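
The control-word dance above (force the rounding-control field to round-up, frndint, restore the saved environment) has a portable rendering via fenv.h. A minimal sketch, not the library code; note that rint may leave the "inexact" flag raised, whereas fldenv discards it:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* Force round-toward-+inf (the 'orl $0x0800 / andl $0xfbff' edit of
   the control word), round to an integral value (frndint), then
   restore the caller's mode (fldenv).  s_floor.S below does the same
   with FE_DOWNWARD (the 0x0400 bit).  */
static double
ceil_sketch (double x)
{
  int save = fegetround ();
  fesetround (FE_UPWARD);
  double r = rint (x);
  fesetround (save);
  return r;
}

int
main (void)
{
  printf ("%g %g\n", ceil_sketch (1.2), ceil_sketch (-1.2)); /* 2 -1 */
  return 0;
}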
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
new file mode 100644
index 0000000000..d345c0973b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_ceilf.S,v 1.3 1995/05/08 23:52:44 jtc Exp $")
+
+ENTRY(__ceilf)
+ flds 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x0800,%edx /* round towards +oo */
+ orl 4(%esp),%edx
+ andl $0xfbff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__ceilf)
+weak_alias (__ceilf, ceilf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceill.S b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
new file mode 100644
index 0000000000..7c08f43b24
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__ceill)
+ fldt 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x0800,%edx /* round towards +oo */
+ orl 4(%esp),%edx
+ andl $0xfbff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ /* Preserve "invalid" exceptions from sNaN input. */
+ fnstsw
+ andl $0x1, %eax
+ orl %eax, 8(%esp)
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__ceill)
+weak_alias (__ceill, ceill)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysign.S b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
new file mode 100644
index 0000000000..2520a94427
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysign.S,v 1.4 1995/05/08 23:53:02 jtc Exp $")
+
+ENTRY(__copysign)
+ movl 16(%esp),%edx
+ movl 8(%esp),%eax
+ andl $0x80000000,%edx
+ andl $0x7fffffff,%eax
+ orl %edx,%eax
+ movl %eax,8(%esp)
+ fldl 4(%esp)
+ ret
+END (__copysign)
+weak_alias (__copysign, copysign)
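
The routine above edits x in place in its argument slot: the sign bit comes from y's high word, everything else from x. The same splice in portable C (a sketch; `copysign_bits` is an invented name):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Take the sign bit (bit 63) from y and the other 63 bits from x,
   mirroring the andl $0x80000000 / andl $0x7fffffff / orl sequence
   on the high words.  */
static double
copysign_bits (double x, double y)
{
  uint64_t ux, uy;
  memcpy (&ux, &x, 8);
  memcpy (&uy, &y, 8);
  ux = (ux & 0x7fffffffffffffffULL) | (uy & 0x8000000000000000ULL);
  memcpy (&x, &ux, 8);
  return x;
}

int
main (void)
{
  printf ("%g %g\n", copysign_bits (3.0, -0.0), copysign_bits (-3.0, 1.0));
  return 0;
}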
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
new file mode 100644
index 0000000000..57b1a6f119
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_copysignf.S,v 1.3 1995/05/08 23:53:25 jtc Exp $")
+
+ENTRY(__copysignf)
+ movl 8(%esp),%edx
+ movl 4(%esp),%eax
+ andl $0x80000000,%edx
+ andl $0x7fffffff,%eax
+ orl %edx,%eax
+ movl %eax,4(%esp)
+ flds 4(%esp)
+ ret
+END (__copysignf)
+weak_alias (__copysignf, copysignf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
new file mode 100644
index 0000000000..2163e7b014
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S
@@ -0,0 +1,21 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__copysignl)
+ movl 24(%esp),%edx
+ movl 12(%esp),%eax
+ andl $0x8000,%edx
+ andl $0x7fff,%eax
+ orl %edx,%eax
+ movl %eax,12(%esp)
+ fldt 4(%esp)
+ ret
+END (__copysignl)
+weak_alias (__copysignl, copysignl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
new file mode 100644
index 0000000000..59fded2d5a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+ Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+ Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+ /* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type minus1,@object
+minus1: .double -1.0
+ ASM_SIZE_DIRECTIVE(minus1)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type l2e,@object
+l2e: .tfloat 1.442695040888963407359924681002
+ ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__expm1)
+ movzwl 4+6(%esp), %eax
+ xorb $0x80, %ah // invert sign bit (now 1 is "positive")
+ cmpl $0xc086, %eax // is num >= 704?
+ jae HIDDEN_JUMPTARGET (__exp)
+
+ fldl 4(%esp) // x
+ fxam // Is NaN, +-Inf or +-0?
+ xorb $0x80, %ah
+ cmpl $0xc043, %eax // is num <= -38.0?
+ fstsw %ax
+ movb $0x45, %ch
+ jb 4f
+
+ // Below -38.0 (may be -NaN or -Inf).
+ andb %ah, %ch
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpb $0x01, %ch
+ je 5f // If -NaN, jump.
+ jmp 2f // -large, possibly -Inf.
+
+4: // In range -38.0 to 704.0 (may be +-0 but not NaN or +-Inf).
+ andb %ah, %ch
+ cmpb $0x40, %ch
+ je 3f // If +-0, jump.
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+5: fldt MO(l2e) // log2(e) : x
+ fmulp // log2(e)*x
+ fld %st // log2(e)*x : log2(e)*x
+ // Set round-to-nearest temporarily.
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fstcw 4(%esp)
+ movl $0xf3ff, %ecx
+ andl 4(%esp), %ecx
+ movl %ecx, (%esp)
+ fldcw (%esp)
+ frndint // int(log2(e)*x) : log2(e)*x
+ fldcw 4(%esp)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fsubr %st, %st(1) // int(log2(e)*x) : fract(log2(e)*x)
+ fxch // fract(log2(e)*x) : int(log2(e)*x)
+ f2xm1 // 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+ fscale // 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+ fxch // int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fldl MO(one) // 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fscale // 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fsubrl MO(one) // 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fstp %st(1) // 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1, i.e. e^x-1
+ DBL_CHECK_FORCE_UFLOW
+ ret
+
+2: fstp %st
+ fldl MO(minus1) // Set result to -1.0.
+3: ret
+END(__expm1)
+weak_alias (__expm1, expm1)
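
The int/fract split keeps the f2xm1 argument inside its [-1, 1] domain, and the fscale/fsubr sequence recombines the pieces as 2^n*(2^f - 1) + (2^n - 1) = 2^t - 1. A rough C rendering of the identity (illustrative only; it lacks the extended-precision intermediates, so it is less accurate than the assembly, and `expm1_sketch` is an invented name):

#include <math.h>
#include <stdio.h>

static double
expm1_sketch (double x)
{
  const double log2e = 1.4426950408889634074; /* cf. the l2e constant */
  double t = x * log2e;          /* log2(e) * x */
  double n = rint (t);           /* frndint under round-to-nearest */
  double f = t - n;              /* fract(log2(e)*x), |f| <= 0.5 */
  /* 2^t - 1 = 2^n*(2^f - 1) + (2^n - 1); the assembly forms the same
     sum with fscale and the 'one' constant.  */
  return ldexp (exp2 (f) - 1.0, (int) n) + (ldexp (1.0, (int) n) - 1.0);
}

int
main (void)
{
  for (double x = -4.0; x <= 4.0; x += 1.0)
    printf ("%g: %.17g vs %.17g\n", x, expm1_sketch (x), expm1 (x));
  return 0;
}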
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
new file mode 100644
index 0000000000..4f0b2e7832
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S
@@ -0,0 +1,113 @@
+/* ix87 specific implementation of exp(x)-1.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+ Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>.
+ Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+ /* Using: e^x - 1 = 2^(x * log2(e)) - 1 */
+
+#include <sysdep.h>
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type minus1,@object
+minus1: .double -1.0
+ ASM_SIZE_DIRECTIVE(minus1)
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ .type l2e,@object
+l2e: .tfloat 1.442695040888963407359924681002
+ ASM_SIZE_DIRECTIVE(l2e)
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+ .text
+ENTRY(__expm1f)
+ movzwl 4+2(%esp), %eax
+ xorb $0x80, %ah // invert sign bit (now 1 is "positive")
+ cmpl $0xc2b1, %eax // is num >= 88.5?
+ jae HIDDEN_JUMPTARGET (__expf)
+
+ flds 4(%esp) // x
+ fxam // Is NaN, +-Inf or +-0?
+ xorb $0x80, %ah
+ cmpl $0xc190, %eax // is num <= -18.0?
+ fstsw %ax
+ movb $0x45, %ch
+ jb 4f
+
+ // Below -18.0 (may be -NaN or -Inf).
+ andb %ah, %ch
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ cmpb $0x01, %ch
+ je 5f // If -NaN, jump.
+ jmp 2f // -large, possibly -Inf.
+
+4: // In range -18.0 to 88.5 (may be +-0 but not NaN or +-Inf).
+ andb %ah, %ch
+ cmpb $0x40, %ch
+ je 3f // If +-0, jump.
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+5: fldt MO(l2e) // log2(e) : x
+ fmulp // log2(e)*x
+ fld %st // log2(e)*x : log2(e)*x
+ // Set round-to-nearest temporarily.
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fstcw 4(%esp)
+ movl $0xf3ff, %ecx
+ andl 4(%esp), %ecx
+ movl %ecx, (%esp)
+ fldcw (%esp)
+ frndint // int(log2(e)*x) : log2(e)*x
+ fldcw 4(%esp)
+ addl $8, %esp
+ cfi_adjust_cfa_offset (-8)
+ fsubr %st, %st(1) // int(log2(e)*x) : fract(log2(e)*x)
+ fxch // fract(log2(e)*x) : int(log2(e)*x)
+ f2xm1 // 2^fract(log2(e)*x)-1 : int(log2(e)*x)
+ fscale // 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x)
+ fxch // int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fldl MO(one) // 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fscale // 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fsubrl MO(one) // 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+ fstp %st(1) // 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x)
+	fsubrp	%st, %st(1)	// 2^(log2(e)*x)-1, i.e. e^x-1
+ FLT_CHECK_FORCE_UFLOW
+ ret
+
+2: fstp %st
+ fldl MO(minus1) // Set result to -1.0.
+3: ret
+END(__expm1f)
+weak_alias (__expm1f, expm1f)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
new file mode 100644
index 0000000000..7fbd99b0db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S
@@ -0,0 +1,2 @@
+#define USE_AS_EXPM1L
+#include <e_expl.S>
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabs.S b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
new file mode 100644
index 0000000000..23ae9dccb9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(__fabs)
+ fldl 4(%esp)
+ fabs
+ ret
+END(__fabs)
+weak_alias (__fabs, fabs)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
new file mode 100644
index 0000000000..c0407a8839
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(__fabsf)
+ flds 4(%esp)
+ fabs
+ ret
+END(__fabsf)
+weak_alias (__fabsf, fabsf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
new file mode 100644
index 0000000000..a12a3e050b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S
@@ -0,0 +1,9 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(__fabsl)
+ fldt 4(%esp)
+ fabs
+ ret
+END(__fabsl)
+weak_alias (__fabsl, fabsl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fdim.c b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
new file mode 100644
index 0000000000..6243c62998
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c
@@ -0,0 +1,50 @@
+/* Return positive difference between arguments. i386 version.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <fpu_control.h>
+#include <math.h>
+#include <math_private.h>
+
+double
+__fdim (double x, double y)
+{
+ if (islessequal (x, y))
+ return 0.0;
+
+ /* To avoid double rounding, set double precision for the
+ subtraction. math_narrow_eval is still needed to eliminate
+ excess range in the case of overflow. If the result of the
+ subtraction is in the subnormal range for double, it is exact, so
+ no issues of double rounding for subnormals arise. */
+ fpu_control_t cw, cw_double;
+ _FPU_GETCW (cw);
+ cw_double = (cw & ~_FPU_EXTENDED) | _FPU_DOUBLE;
+ _FPU_SETCW (cw_double);
+ double r = math_narrow_eval (x - y);
+ _FPU_SETCW (cw);
+ if (isinf (r) && !isinf (x) && !isinf (y))
+ __set_errno (ERANGE);
+
+ return r;
+}
+weak_alias (__fdim, fdim)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__fdim, __fdiml)
+weak_alias (__fdim, fdiml)
+#endif
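
The double-rounding hazard that the precision switch guards against can be seen concretely: 1 + 2^-53 + 2^-105 rounded once to double is 1 + 2^-52, but rounded first to the 64-bit x87 significand and then to double it collapses to 1.0. A small demonstration (on SSE or non-x87 builds both lines will agree, since there is no intermediate extended rounding):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  volatile double x = 1.0;
  volatile double y = -(0x1p-53 + 0x1p-105);
  /* fdim forces double precision for the subtraction, so the result
     is rounded exactly once.  */
  printf ("fdim  = %.17g\n", fdim (x, y));       /* 1.0000000000000002 */
  printf ("naive = %.17g\n", (double) (x - y));  /* may print 1 on x87 */
  return 0;
}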
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finite.S b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
new file mode 100644
index 0000000000..1ae4aed451
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finite.S
@@ -0,0 +1,17 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finite)
+ movl 8(%esp),%eax
+ movl $0xFFEFFFFF,%ecx
+ subl %eax,%ecx
+ xorl %ecx,%eax
+ shrl $31, %eax
+ ret
+END (__finite)
+weak_alias (__finite, finite)
+hidden_def (__finite)
+
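
The three integer instructions above implement "exponent field not all ones" without a branch: ((0xFFEFFFFF - hi) ^ hi) has bit 31 set exactly when x is finite. In C (a sketch; `finite_sketch` is an invented name):

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <math.h>

/* x is finite iff the exponent field of the high word is below 0x7ff.
   The subtraction flips bit 31 relative to hi exactly in that case,
   and the xor/shift extracts the result as 0 or 1.  */
static int
finite_sketch (double x)
{
  uint64_t u;
  memcpy (&u, &x, 8);
  uint32_t hi = (uint32_t) (u >> 32);
  return (int) (((0xFFEFFFFFu - hi) ^ hi) >> 31);
}

int
main (void)
{
  printf ("%d %d %d %d\n", finite_sketch (1.0), finite_sketch (-0.0),
          finite_sketch (INFINITY), finite_sketch (NAN)); /* 1 1 0 0 */
  return 0;
}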
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitef.S b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
new file mode 100644
index 0000000000..69e72facff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S
@@ -0,0 +1,16 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitef)
+ movl 4(%esp),%eax
+ movl $0xFF7FFFFF,%ecx
+ subl %eax,%ecx
+ xorl %ecx,%eax
+ shrl $31,%eax
+ ret
+END (__finitef)
+weak_alias (__finitef, finitef)
+hidden_def (__finitef)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitel.S b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
new file mode 100644
index 0000000000..cce90e18fc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S
@@ -0,0 +1,15 @@
+/*
+ * Written by Joe Keane <jgk@jgk.org>.
+ */
+
+#include <machine/asm.h>
+
+ENTRY(__finitel)
+ movl 12(%esp),%eax
+ orl $0xffff8000, %eax
+ incl %eax
+ shrl $31, %eax
+ ret
+END (__finitel)
+weak_alias (__finitel, finitel)
+hidden_def (__finitel)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floor.S b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
new file mode 100644
index 0000000000..ed837dae40
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floor.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floor.S,v 1.4 1995/05/09 00:01:59 jtc Exp $")
+
+ENTRY(__floor)
+ fldl 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x400,%edx /* round towards -oo */
+ orl 4(%esp),%edx
+ andl $0xf7ff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__floor)
+weak_alias (__floor, floor)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorf.S b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
new file mode 100644
index 0000000000..84b6f7ed99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S
@@ -0,0 +1,34 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_floorf.S,v 1.3 1995/05/09 00:04:32 jtc Exp $")
+
+ENTRY(__floorf)
+ flds 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x400,%edx /* round towards -oo */
+ orl 4(%esp),%edx
+ andl $0xf7ff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__floorf)
+weak_alias (__floorf, floorf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorl.S b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
new file mode 100644
index 0000000000..dc74a0c446
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S
@@ -0,0 +1,40 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__floorl)
+ fldt 4(%esp)
+ subl $32,%esp
+ cfi_adjust_cfa_offset (32)
+
+ fnstenv 4(%esp) /* store fpu environment */
+
+	/* We use %edx here although only the low 16 bits are defined.
+	   But none of the operations should care, and the 32-bit
+	   operations are faster than the 16-bit ones.  */
+ movl $0x400,%edx /* round towards -oo */
+ orl 4(%esp),%edx
+ andl $0xf7ff,%edx
+ movl %edx,(%esp)
+ fldcw (%esp) /* load modified control word */
+
+ frndint /* round */
+
+ /* Preserve "invalid" exceptions from sNaN input. */
+ fnstsw
+ andl $0x1, %eax
+ orl %eax, 8(%esp)
+
+ fldenv 4(%esp) /* restore original environment */
+
+ addl $32,%esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__floorl)
+weak_alias (__floorl, floorl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
new file mode 100644
index 0000000000..218dcef421
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmax)
+ fldl 12(%esp) // y
+ fxam
+ fnstsw
+ fldl 4(%esp) // y : x
+
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 1f // y == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jnc 1f
+
+ fxch %st(1)
+1: fstp %st(1)
+
+ ret
+END(__fmax)
+weak_alias (__fmax, fmax)
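
The fxam/fucom sequence encodes the "NaN as missing argument" rule from the header comment: a NaN y is discarded up front, and an unordered compare (x being NaN) lets y survive. The equivalent control flow in C (a sketch; `fmax_sketch` is an invented name):

#include <math.h>
#include <stdio.h>

/* If one operand is a NaN the other is returned; only when both are
   NaN does a NaN come back.  */
static double
fmax_sketch (double x, double y)
{
  if (isnan (y))            /* the fxam/je path: drop y, keep x */
    return x;
  if (isnan (x))            /* fucom reports unordered: drop x */
    return y;
  return x >= y ? x : y;    /* ordinary comparison */
}

int
main (void)
{
  printf ("%g %g %g\n", fmax_sketch (NAN, 2.0), fmax_sketch (2.0, NAN),
          fmax_sketch (1.0, 2.0)); /* 2 2 2 */
  return 0;
}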
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..b7a00cefeb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S
@@ -0,0 +1,43 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmaxf)
+ flds 8(%esp) // y
+ fxam
+ fnstsw
+ flds 4(%esp) // y : x
+
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 1f // y == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jnc 1f
+
+ fxch %st(1)
+1: fstp %st(1)
+
+ ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..68162921db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S
@@ -0,0 +1,71 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmaxl)
+ fldt 16(%esp) // y
+ fxam
+ fnstsw
+ fldt 4(%esp) // y : x
+
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 2f // y == NaN
+
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 3f // x == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jnc 1f
+
+ fxch %st(1)
+1: fstp %st(1)
+
+ ret
+
+2: // st(1) is a NaN; st(0) may or may not be.
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) is not. Test if st(0) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(0)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
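
The testb $0x40 instructions read bit 62 of the 80-bit operand, the "quiet" bit: a NaN with it clear is signaling, and the faddp fallback then raises "invalid" and returns a quieted NaN. A sketch of the same bit test (it assumes the little-endian x86 layout of long double; `is_snanl_sketch` is an invented name):

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <math.h>

/* In the x87 80-bit format, significand byte 7 carries bit 62, the
   quiet bit; 11(%esp) in the assembly is exactly that byte of the
   first argument.  A NaN with the bit clear is signaling.  */
static int
is_snanl_sketch (long double x)
{
  unsigned char b[sizeof (long double)] = { 0 };
  memcpy (b, &x, sizeof (long double));
  unsigned exp = ((unsigned) (b[9] & 0x7f) << 8) | b[8];
  uint64_t mant;
  memcpy (&mant, b, 8);
  return exp == 0x7fff
         && (mant & 0x7fffffffffffffffULL) != 0  /* a NaN, not Inf */
         && !(b[7] & 0x40);                      /* quiet bit clear */
}

int
main (void)
{
  printf ("%d\n", is_snanl_sketch ((long double) NAN)); /* 0: quiet NaN */
  return 0;
}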
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
new file mode 100644
index 0000000000..a5bb0e06dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmin)
+ fldl 4(%esp) // x
+ fldl 12(%esp) // x : y
+
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 1f // y == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jc 2f
+
+1: fxch %st(1)
+2: fstp %st(1)
+
+ ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
new file mode 100644
index 0000000000..fba4a41120
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S
@@ -0,0 +1,43 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminf)
+ flds 4(%esp) // x
+ flds 8(%esp) // x : y
+
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 1f // y == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jc 2f
+
+1: fxch %st(1)
+2: fstp %st(1)
+
+ ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
new file mode 100644
index 0000000000..12ef21fda9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S
@@ -0,0 +1,71 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminl)
+ fldt 16(%esp) // y
+ fxam
+ fnstsw
+ fldt 4(%esp) // y : x
+
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 2f // y == NaN
+
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 3f // x == NaN
+
+ fucom %st(1)
+ fnstsw
+ sahf
+ jc 1f
+
+ fxch %st(1)
+1: fstp %st(1)
+
+ ret
+
+2: // st(1) is a NaN; st(0) may or may not be.
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x01, %ah
+ je 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) is not. Test if st(0) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(0)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
new file mode 100644
index 0000000000..ce19fd0035
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c
@@ -0,0 +1,42 @@
+/* Return classification value corresponding to argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <math.h>
+
+#include <math_private.h>
+
+
+int
+__fpclassifyl (long double x)
+{
+ u_int32_t ex, hx, lx;
+ int retval = FP_NORMAL;
+
+ GET_LDOUBLE_WORDS (ex, hx, lx, x);
+ ex &= 0x7fff;
+ if ((ex | lx | hx) == 0)
+ retval = FP_ZERO;
+ else if (ex == 0 && (hx & 0x80000000) == 0)
+ retval = FP_SUBNORMAL;
+ else if (ex == 0x7fff)
+ retval = ((hx & 0x7fffffff) | lx) != 0 ? FP_NAN : FP_INFINITE;
+
+ return retval;
+}
+libm_hidden_def (__fpclassifyl)
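
Callers reach this through the fpclassify macro; for the x87 format the subnormal case is the one with a zero exponent and the explicit integer bit (bit 31 of hx above) clear. A small usage harness (illustrative; the subnormal constant assumes the x87 long double range):

#include <math.h>
#include <stdio.h>

static const char *
class_name (int c)
{
  switch (c)
    {
    case FP_ZERO:      return "zero";
    case FP_SUBNORMAL: return "subnormal";
    case FP_NORMAL:    return "normal";
    case FP_INFINITE:  return "infinite";
    case FP_NAN:       return "nan";
    default:           return "?";
    }
}

int
main (void)
{
  long double v[] = { 0.0L, 0x1p-16400L /* subnormal on x87 */, 1.0L,
                      (long double) INFINITY, (long double) NAN };
  for (int i = 0; i < 5; i++)
    printf ("%Lg -> %s\n", v[i], class_name (fpclassify (v[i])));
  return 0;
}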
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexp.S b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
new file mode 100644
index 0000000000..104f733bf6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S
@@ -0,0 +1,83 @@
+/* ix87 specific frexp implementation for double.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type two54,@object
+two54: .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43
+ ASM_SIZE_DIRECTIVE(two54)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS 4 /* no space for saved regs */
+#define VAL0 PARMS
+#define VAL1 VAL0+4
+#define EXPP VAL1+4
+
+ .text
+ENTRY (__frexp)
+
+ movl VAL0(%esp), %ecx
+ movl VAL1(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+ orl %eax, %ecx
+ jz 1f
+ xorl %ecx, %ecx
+ cmpl $0x7ff00000, %eax
+ jae 1f
+
+ cmpl $0x00100000, %eax
+ jae 2f
+
+ fldl VAL0(%esp)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fmull MO(two54)
+ movl $-54, %ecx
+ fstpl VAL0(%esp)
+ fwait
+ movl VAL1(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+
+2: shrl $20, %eax
+ andl $0x800fffff, %edx
+ subl $1022, %eax
+ orl $0x3fe00000, %edx
+ addl %eax, %ecx
+ movl %edx, VAL1(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the normalized fraction from the stack and return.  */
+1: movl EXPP(%esp), %eax
+ fldl VAL0(%esp)
+ movl %ecx, (%eax)
+
+ ret
+END (__frexp)
+weak_alias (__frexp, frexp)
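
In C, the exponent splice above looks like this: pre-scale subnormals by 2^54, read off the biased exponent, and replace it with 0x3fe so the fraction lands in [0.5, 1). A hedged sketch, not the library code (`frexp_sketch` is an invented name):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static double
frexp_sketch (double x, int *eptr)
{
  uint64_t u;
  memcpy (&u, &x, 8);
  int e = 0;
  unsigned biased = (unsigned) (u >> 52) & 0x7ff;
  if ((u & 0x7fffffffffffffffULL) == 0 || biased == 0x7ff)
    {
      *eptr = 0;                 /* +-0, Inf, NaN: return x unchanged */
      return x;
    }
  if (biased == 0)               /* subnormal: fmull MO(two54) */
    {
      x *= 0x1p54;
      e = -54;
      memcpy (&u, &x, 8);
      biased = (unsigned) (u >> 52) & 0x7ff;
    }
  e += (int) biased - 1022;      /* subl $1022, %eax */
  u = (u & 0x800fffffffffffffULL) | 0x3fe0000000000000ULL; /* the splice */
  memcpy (&x, &u, 8);
  *eptr = e;
  return x;
}

int
main (void)
{
  int e;
  double m = frexp_sketch (12.0, &e);
  printf ("12 = %g * 2^%d\n", m, e);   /* 0.75 * 2^4 */
  return 0;
}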
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
new file mode 100644
index 0000000000..f21c39ec4b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S
@@ -0,0 +1,80 @@
+/* ix87 specific frexp implementation for float.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type two25,@object
+two25: .byte 0, 0, 0, 0x4c
+ ASM_SIZE_DIRECTIVE(two25)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS 4 /* no space for saved regs */
+#define VAL PARMS
+#define EXPP VAL+4
+
+ .text
+ENTRY (__frexpf)
+
+ movl VAL(%esp), %eax
+ xorl %ecx, %ecx
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+ jz 1f
+ cmpl $0x7f800000, %eax
+ jae 1f
+
+ cmpl $0x00800000, %eax
+ jae 2f
+
+ flds VAL(%esp)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fmuls MO(two25)
+ movl $-25, %ecx
+ fstps VAL(%esp)
+ fwait
+ movl VAL(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fffffff, %eax
+
+2: shrl $23, %eax
+ andl $0x807fffff, %edx
+ subl $126, %eax
+ orl $0x3f000000, %edx
+ addl %eax, %ecx
+ movl %edx, VAL(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the normalized fraction from the stack and return.  */
+1: movl EXPP(%esp), %eax
+ flds VAL(%esp)
+ movl %ecx, (%eax)
+
+ ret
+END (__frexpf)
+weak_alias (__frexpf, frexpf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
new file mode 100644
index 0000000000..04f28888d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
@@ -0,0 +1,92 @@
+/* ix87 specific frexp implementation for long double.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type two64,@object
+two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+ ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS 4 /* no space for saved regs */
+#define VAL0 PARMS
+#define VAL1 VAL0+4
+#define VAL2 VAL1+4
+#define EXPP VAL2+4
+
+ .text
+ENTRY (__frexpl)
+
+ movl VAL0(%esp), %ecx
+ movl VAL2(%esp), %eax
+ orl VAL1(%esp), %ecx
+ movl %eax, %edx
+ andl $0x7fff, %eax
+ orl %eax, %ecx
+ jz 1f
+ xorl %ecx, %ecx
+ cmpl $0x7fff, %eax
+ je 3f
+
+ cmpl $0, %eax
+ jne 2f
+
+ fldt VAL0(%esp)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+	fmull	MO(two64)	/* It is not necessary to use an 80-bit factor.  */
+ movl $-64, %ecx
+ fstpt VAL0(%esp)
+ fwait
+ movl VAL2(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fff, %eax
+
+2: andl $0x8000, %edx
+ subl $16382, %eax
+ orl $0x3ffe, %edx
+ addl %eax, %ecx
+ movl %edx, VAL2(%esp)
+
+	/* Store %ecx in the variable pointed to by the second argument,
+	   get the normalized fraction from the stack and return.  */
+1: movl EXPP(%esp), %eax
+ fldt VAL0(%esp)
+ movl %ecx, (%eax)
+
+ ret
+
+ /* Infinity or NaN; ensure signaling NaNs are quieted. */
+3: movl EXPP(%esp), %eax
+ fldt VAL0(%esp)
+ fadd %st
+ movl %ecx, (%eax)
+ ret
+END (__frexpl)
+weak_alias (__frexpl, frexpl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
new file mode 100644
index 0000000000..cdd77183fa
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
@@ -0,0 +1,32 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Change for long double by Ulrich Drepper <drepper@cygnus.com>.
+ * Intel i387 specific version.
+ * Public domain.
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isinfl(long double x)
+{
+ int32_t se,hx,lx;
+ GET_LDOUBLE_WORDS(se,hx,lx,x);
+	/* This additional ^ 0x80000000 is necessary because in Intel's
+	   internal representation the normally implicit leading one is
+	   explicit.  */
+ lx |= (hx ^ 0x80000000) | ((se & 0x7fff) ^ 0x7fff);
+ lx |= -lx;
+ se &= 0x8000;
+ return ~(lx >> 31) & (1 - (se >> 14));
+}
+hidden_def (__isinfl)
+weak_alias (__isinfl, isinfl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
new file mode 100644
index 0000000000..816396d8fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c
@@ -0,0 +1,43 @@
+/* s_isnanl.c -- long double version for i387 of s_isnan.c.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isnanl(x) returns 1 if x is nan, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isnanl(long double x)
+{
+ int32_t se,hx,lx;
+ GET_LDOUBLE_WORDS(se,hx,lx,x);
+ se = (se & 0x7fff) << 1;
+	/* The additional & 0x7fffffff is required because Intel's
+	   extended format has the normally implicit leading 1
+	   explicitly present.  Sigh!  */
+ lx |= hx & 0x7fffffff;
+ se |= (u_int32_t)(lx|(-lx))>>31;
+ se = 0xfffe - se;
+ return (int)((u_int32_t)(se))>>16;
+}
+hidden_def (__isnanl)
+weak_alias (__isnanl, isnanl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrint.S b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
new file mode 100644
index 0000000000..a597183aab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__llrint)
+ fldl 4(%esp)
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fistpll (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__llrint)
+weak_alias (__llrint, llrint)
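
fistpll converts using the FPU's current rounding mode, which is precisely what gives llrint its mode-dependent result. A short usage demonstration:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  fesetround (FE_TONEAREST);
  printf ("%lld\n", llrint (2.5));   /* 2: ties round to even */
  fesetround (FE_UPWARD);
  printf ("%lld\n", llrint (2.5));   /* 3 */
  fesetround (FE_TONEAREST);
  return 0;
}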
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
new file mode 100644
index 0000000000..a4b574eccb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__llrintf)
+ flds 4(%esp)
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fistpll (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__llrintf)
+weak_alias (__llrintf, llrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
new file mode 100644
index 0000000000..7b48c02ef4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S
@@ -0,0 +1,36 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__llrintl)
+ fldt 4(%esp)
+ subl $8, %esp
+ cfi_adjust_cfa_offset (8)
+ fistpll (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__llrintl)
+weak_alias (__llrintl, llrintl)
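
All three llrint variants rely on fistpll storing with the FPU's current
rounding mode.  A minimal C demonstration of the user-visible behaviour this
provides (a usage sketch, assuming a C99 <fenv.h> environment; link with -lm):

  #include <fenv.h>
  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    fesetround (FE_TONEAREST);
    printf ("%lld\n", llrint (2.5));  /* 2: ties round to even */
    fesetround (FE_DOWNWARD);
    printf ("%lld\n", llrint (2.5));  /* 2: rounds toward -inf */
    fesetround (FE_UPWARD);
    printf ("%lld\n", llrint (2.5));  /* 3: rounds toward +inf */
    return 0;
  }
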
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1p.S b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
new file mode 100644
index 0000000000..7978e76095
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+	   -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe cutoff.
+	*/
+limit: .double 0.29
+one: .double 1.0
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x with the extra computation it requires.
+ */
+ .text
+ENTRY(__log1p)
+ fldln2
+
+ fldl 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fabs
+ fcompl MO(limit)
+ fnstsw
+ sahf
+ jc 2f
+
+ faddl MO(one)
+ fyl2x
+ ret
+
+2: fyl2xp1
+ DBL_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+
+END (__log1p)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
new file mode 100644
index 0000000000..acaa299d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1pf.S,v 1.4 1995/05/09 00:13:05 jtc Exp $")
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+	   -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe cutoff.
+	*/
+limit: .float 0.29
+one: .float 1.0
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x with the extra computation it requires.
+ */
+ .text
+ENTRY(__log1pf)
+ fldln2
+
+ flds 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fabs
+ fcomps MO(limit)
+ fnstsw
+ sahf
+ jc 2f
+
+ fadds MO(one)
+ fyl2x
+ ret
+
+2: fyl2xp1
+ FLT_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+
+END (__log1pf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
new file mode 100644
index 0000000000..0fd05cbdb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
@@ -0,0 +1,76 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+	/* The fyl2xp1 instruction can only be used for values in
+	   -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2;
+	   0.29 is a safe cutoff.
+	*/
+limit: .tfloat 0.29
+	/* Please note: we use a double value here.  Since 1.0 has
+	   an exact representation this does not affect the accuracy
+	   but it helps to optimize the code.  */
+one: .double 1.0
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 instruction when the argument is in the range -0.29 to
+ * 0.29; otherwise use fyl2x with the extra computation it requires.
+ */
+ .text
+ENTRY(__log1pl)
+ fldln2
+
+ fldt 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4:
+ fabs
+ fldt MO(limit)
+ fcompp
+ fnstsw
+ sahf
+ jnc 2f
+
+ movzwl 4+8(%esp), %eax
+ xorb $0x80, %ah
+ cmpl $0xc040, %eax
+ jae 5f
+
+ faddl MO(one)
+5: fyl2x
+ ret
+
+2: fyl2xp1
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ fadd %st(0)
+ ret
+
+END (__log1pl)
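
The dedicated fyl2xp1 path exists because forming 1+x first destroys the low
bits of a small x.  A portable sketch of the same accuracy concern — the
well-known correction trick, not the i387 method, assuming IEEE double
arithmetic:

  #include <math.h>

  double
  log1p_sketch (double x)
  {
    double u = 1.0 + x;
    if (u == 1.0)
      return x;                        /* 1+x rounded to 1; log1p(x) ~= x */
    return log (u) * (x / (u - 1.0));  /* rescale to cancel the rounding
                                          error made when forming 1+x */
  }
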
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logb.S b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
new file mode 100644
index 0000000000..f78c091c8a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logb.S,v 1.4 1995/05/09 00:14:30 jtc Exp $")
+
+ENTRY(__logb)
+ fldl 4(%esp)
+ fxtract
+ fstp %st
+ ret
+END (__logb)
+weak_alias (__logb, logb)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbf.S b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
new file mode 100644
index 0000000000..91eb3d2925
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_logbf.S,v 1.3 1995/05/09 00:15:12 jtc Exp $")
+
+ENTRY(__logbf)
+ flds 4(%esp)
+ fxtract
+ fstp %st
+ ret
+END (__logbf)
+weak_alias (__logbf, logbf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbl.c b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
new file mode 100644
index 0000000000..391e2db489
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__logbl (long double x)
+{
+ long double res;
+
+ asm ("fxtract\n"
+ "fstp %%st" : "=t" (res) : "0" (x));
+ return res;
+}
+
+weak_alias (__logbl, logbl)
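
fxtract splits st(0) into significand and exponent; popping the significand
with fstp %st leaves the unbiased exponent, which is exactly logb.  A quick
illustrative check:

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    printf ("%g\n", logb (1024.0));  /* 10: 1024 = 1.0 * 2^10 */
    printf ("%g\n", logb (0.03125)); /* -5: 0.03125 = 1.0 * 2^-5 */
    printf ("%g\n", logb (48.0));    /*  5: 48 = 1.5 * 2^5 */
    return 0;
  }
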
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrint.S b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
new file mode 100644
index 0000000000..79a374b399
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__lrint)
+ fldl 4(%esp)
+ subl $4, %esp
+ cfi_adjust_cfa_offset (4)
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__lrint)
+weak_alias (__lrint, lrint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
new file mode 100644
index 0000000000..fc6e68e073
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__lrintf)
+ flds 4(%esp)
+ subl $4, %esp
+ cfi_adjust_cfa_offset (4)
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__lrintf)
+weak_alias (__lrintf, lrintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
new file mode 100644
index 0000000000..ba6dbdf44c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S
@@ -0,0 +1,34 @@
+/* Round argument to nearest integral value according to current rounding
+ direction.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__lrintl)
+ fldt 4(%esp)
+ subl $4, %esp
+ cfi_adjust_cfa_offset (4)
+ fistpl (%esp)
+ fwait
+ popl %eax
+ cfi_adjust_cfa_offset (-4)
+ ret
+END(__lrintl)
+weak_alias (__lrintl, lrintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
new file mode 100644
index 0000000000..f7b79b6ff2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyint)
+ fldl 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ frndint
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__nearbyint)
+weak_alias (__nearbyint, nearbyint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
new file mode 100644
index 0000000000..92df2f87b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S
@@ -0,0 +1,20 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintf)
+ flds 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ frndint
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__nearbyintf)
+weak_alias (__nearbyintf, nearbyintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
new file mode 100644
index 0000000000..3b7d1e2436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */
+
+#include <machine/asm.h>
+
+ENTRY(__nearbyintl)
+ fldt 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ frndint
+ fnstsw
+ andl $0x1, %eax
+ orl %eax, 8(%esp)
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END (__nearbyintl)
+weak_alias (__nearbyintl, nearbyintl)
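
The fnstenv/fldenv pair is there because nearbyint, unlike rint, must not
raise the inexact exception: reloading the saved environment discards any
flag frndint set.  A portable sketch of the same idea using C99 <fenv.h>:

  #include <fenv.h>
  #include <math.h>

  double
  nearbyint_sketch (double x)
  {
    fenv_t env;
    feholdexcept (&env);  /* save the environment, clear flags */
    double r = rint (x);  /* may raise FE_INEXACT */
    fesetenv (&env);      /* restore: the inexact flag is dropped */
    return r;
  }
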
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
new file mode 100644
index 0000000000..600ad7a8d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c
@@ -0,0 +1,125 @@
+/* s_nextafterl.c -- long double version of s_nextafter.c.
+ * Special version for i387.
+ * Conversion to long double by Ulrich Drepper,
+ * Cygnus Support, drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nextafterl(x,y)
+ *	Return the next machine floating-point number after x in the
+ *	direction of y.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+long double __nextafterl(long double x, long double y)
+{
+ u_int32_t hx,hy,ix,iy;
+ u_int32_t lx,ly;
+ int32_t esx,esy;
+
+ GET_LDOUBLE_WORDS(esx,hx,lx,x);
+ GET_LDOUBLE_WORDS(esy,hy,ly,y);
+ ix = esx&0x7fff; /* |x| */
+ iy = esy&0x7fff; /* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+ if(((ix==0x7fff)&&(((hx&0x7fffffff)|lx)!=0)) || /* x is nan */
+ ((iy==0x7fff)&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */
+ return x+y;
+ if(x==y) return y; /* x=y, return y */
+ if((ix|hx|lx)==0) { /* x == 0 */
+ long double u;
+ SET_LDOUBLE_WORDS(x,esy&0x8000,0,1);/* return +-minsubnormal */
+ u = math_opt_barrier (x);
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ return x;
+ }
+ if(esx>=0) { /* x > 0 */
+ if(esx>esy||((esx==esy) && (hx>hy||((hx==hy)&&(lx>ly))))) {
+ /* x > y, x -= ulp */
+ if(lx==0) {
+ if (hx <= 0x80000000) {
+ if (esx == 0) {
+ --hx;
+ } else {
+ esx -= 1;
+ hx = hx - 1;
+ if (esx > 0)
+ hx |= 0x80000000;
+ }
+ } else
+ hx -= 1;
+ }
+ lx -= 1;
+ } else { /* x < y, x += ulp */
+ lx += 1;
+ if(lx==0) {
+ hx += 1;
+ if (hx==0 || (esx == 0 && hx == 0x80000000)) {
+ esx += 1;
+ hx |= 0x80000000;
+ }
+ }
+ }
+ } else { /* x < 0 */
+ if(esy>=0||(esx>esy||((esx==esy)&&(hx>hy||((hx==hy)&&(lx>ly)))))){
+ /* x < y, x -= ulp */
+ if(lx==0) {
+ if (hx <= 0x80000000 && esx != 0xffff8000) {
+ esx -= 1;
+ hx = hx - 1;
+ if ((esx&0x7fff) > 0)
+ hx |= 0x80000000;
+ } else
+ hx -= 1;
+ }
+ lx -= 1;
+ } else { /* x > y, x += ulp */
+ lx += 1;
+ if(lx==0) {
+ hx += 1;
+ if (hx==0 || (esx == 0xffff8000 && hx == 0x80000000)) {
+ esx += 1;
+ hx |= 0x80000000;
+ }
+ }
+ }
+ }
+ esy = esx&0x7fff;
+ if(esy==0x7fff) {
+ long double u = x + x; /* overflow */
+ math_force_eval (u);
+ __set_errno (ERANGE);
+ }
+ if(esy==0) {
+ long double u = x*x; /* underflow */
+ math_force_eval (u); /* raise underflow flag */
+ __set_errno (ERANGE);
+ }
+ SET_LDOUBLE_WORDS(x,esx,hx,lx);
+ return x;
+}
+weak_alias (__nextafterl, nextafterl)
+strong_alias (__nextafterl, __nexttowardl)
+weak_alias (__nextafterl, nexttowardl)
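
Since the x86 extended format carries a 64-bit mantissa, stepping one ulp
away from 1.0 moves by 2^-63.  A hedged usage check of the function above:

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    long double x = 1.0L;
    long double up = nextafterl (x, 2.0L);
    /* Expected: about 1.0842e-19, i.e. 2^-63 for the i387 mantissa.  */
    printf ("ulp above 1.0L = %Lg\n", up - x);
    return 0;
  }
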
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
new file mode 100644
index 0000000000..0b47044760
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c
@@ -0,0 +1,93 @@
+/* s_nexttoward.c
+ * Special i387 version
+ * Conversion from s_nextafter.c by Ulrich Drepper, Cygnus Support,
+ * drepper@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/* IEEE functions
+ *	nexttoward(x,y)
+ *	Return the next machine floating-point number after x in the
+ *	direction of y.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+double __nexttoward(double x, long double y)
+{
+ int32_t hx,ix,iy;
+ u_int32_t lx,hy,ly,esy;
+
+ EXTRACT_WORDS(hx,lx,x);
+ GET_LDOUBLE_WORDS(esy,hy,ly,y);
+ ix = hx&0x7fffffff; /* |x| */
+ iy = esy&0x7fff; /* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+ if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) || /* x is nan */
+ ((iy>=0x7fff)&&((hy&0x7fffffff)|ly)!=0)) /* y is nan */
+ return x+y;
+ if((long double) x==y) return y; /* x=y, return y */
+ if((ix|lx)==0) { /* x == 0 */
+ double u;
+ INSERT_WORDS(x,(esy&0x8000)<<16,1); /* return +-minsub */
+ u = math_opt_barrier (x);
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ return x;
+ }
+ if(hx>=0) { /* x > 0 */
+ if (x > y) { /* x -= ulp */
+ if(lx==0) hx -= 1;
+ lx -= 1;
+ } else { /* x < y, x += ulp */
+ lx += 1;
+ if(lx==0) hx += 1;
+ }
+ } else { /* x < 0 */
+ if (x < y) { /* x -= ulp */
+ if(lx==0) hx -= 1;
+ lx -= 1;
+ } else { /* x > y, x += ulp */
+ lx += 1;
+ if(lx==0) hx += 1;
+ }
+ }
+ hy = hx&0x7ff00000;
+ if(hy>=0x7ff00000) {
+ double u = x+x; /* overflow */
+ math_force_eval (u);
+ __set_errno (ERANGE);
+ }
+ if(hy<0x00100000) {
+ double u = x*x; /* underflow */
+ math_force_eval (u); /* raise underflow flag */
+ __set_errno (ERANGE);
+ }
+ INSERT_WORDS(x,hx,lx);
+ return x;
+}
+weak_alias (__nexttoward, nexttoward)
+#ifdef NO_LONG_DOUBLE
+strong_alias (__nexttoward, __nexttowardl)
+weak_alias (__nexttoward, nexttowardl)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
new file mode 100644
index 0000000000..e1156d1e4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c
@@ -0,0 +1,77 @@
+/* s_nexttowardf.c -- float version of s_nextafter.c.
+ * Special i387 version.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include <float.h>
+
+float __nexttowardf(float x, long double y)
+{
+ int32_t hx,ix,iy;
+ u_int32_t hy,ly,esy;
+
+ GET_FLOAT_WORD(hx,x);
+ GET_LDOUBLE_WORDS(esy,hy,ly,y);
+ ix = hx&0x7fffffff; /* |x| */
+ iy = esy&0x7fff; /* |y| */
+
+	/* Intel's extended format has the normally implicit 1 explicitly
+	   present.  Sigh!  */
+ if((ix>0x7f800000) || /* x is nan */
+ (iy>=0x7fff&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */
+ return x+y;
+ if((long double) x==y) return y; /* x=y, return y */
+ if(ix==0) { /* x == 0 */
+ float u;
+ SET_FLOAT_WORD(x,((esy&0x8000)<<16)|1);/* return +-minsub*/
+ u = math_opt_barrier (x);
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ return x;
+ }
+ if(hx>=0) { /* x > 0 */
+ if(x > y) { /* x -= ulp */
+ hx -= 1;
+ } else { /* x < y, x += ulp */
+ hx += 1;
+ }
+ } else { /* x < 0 */
+ if(x < y) { /* x -= ulp */
+ hx -= 1;
+ } else { /* x > y, x += ulp */
+ hx += 1;
+ }
+ }
+ hy = hx&0x7f800000;
+ if(hy>=0x7f800000) {
+ float u = x+x; /* overflow */
+ math_force_eval (u);
+ __set_errno (ERANGE);
+ }
+ if(hy<0x00800000) {
+ float u = x*x; /* underflow */
+ math_force_eval (u); /* raise underflow flag */
+ __set_errno (ERANGE);
+ }
+ SET_FLOAT_WORD(x,hx);
+ return x;
+}
+weak_alias (__nexttowardf, nexttowardf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquo.S b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
new file mode 100644
index 0000000000..341285db30
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define DVDND PARMS
+#define DVSOR DVDND+8
+#define QUOP DVSOR+8
+
+ .text
+ENTRY (__remquo)
+
+ fldl DVSOR(%esp)
+ fldl DVDND(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+	/* Recover the low three bits of the quotient from the fprem1
+	   condition codes (C0 = Q2, C3 = Q1, C1 = Q0), un-permuting them
+	   via the 3-bit lookup table packed into 0xef2a60.  */
+ movl %eax, %ecx
+ shrl $8, %eax
+ shrl $12, %ecx
+ andl $4, %ecx
+ andl $3, %eax
+ orl %eax, %ecx
+ leal (%ecx,%ecx,2),%ecx
+ movl $0xef2a60, %eax
+ shrl %cl, %eax
+ andl $7, %eax
+ movl QUOP(%esp), %ecx
+ movl DVDND+4(%esp), %edx
+ xorl DVSOR+4(%esp), %edx
+ testl $0x80000000, %edx
+ jz 1f
+ negl %eax
+1: movl %eax, (%ecx)
+
+ ret
+END (__remquo)
+weak_alias (__remquo, remquo)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquof.S b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
new file mode 100644
index 0000000000..62063f068f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define DVDND PARMS
+#define DVSOR DVDND+4
+#define QUOP DVSOR+4
+
+ .text
+ENTRY (__remquof)
+
+ flds DVSOR(%esp)
+ flds DVDND(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+	/* Recover the low three bits of the quotient from the fprem1
+	   condition codes (C0 = Q2, C3 = Q1, C1 = Q0), un-permuting them
+	   via the 3-bit lookup table packed into 0xef2a60.  */
+ movl %eax, %ecx
+ shrl $8, %eax
+ shrl $12, %ecx
+ andl $4, %ecx
+ andl $3, %eax
+ orl %eax, %ecx
+ leal (%ecx,%ecx,2),%ecx
+ movl $0xef2a60, %eax
+ shrl %cl, %eax
+ andl $7, %eax
+ movl QUOP(%esp), %ecx
+ movl DVDND(%esp), %edx
+ xorl DVSOR(%esp), %edx
+ testl $0x80000000, %edx
+ jz 1f
+ negl %eax
+1: movl %eax, (%ecx)
+
+ ret
+END (__remquof)
+weak_alias (__remquof, remquof)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquol.S b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
new file mode 100644
index 0000000000..f3d84fc7c2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S
@@ -0,0 +1,45 @@
+/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define DVDND PARMS
+#define DVSOR DVDND+12
+#define QUOP DVSOR+12
+
+ .text
+ENTRY (__remquol)
+
+ fldt DVSOR(%esp)
+ fldt DVDND(%esp)
+1: fprem1
+ fstsw %ax
+ sahf
+ jp 1b
+ fstp %st(1)
+	/* Recover the low three bits of the quotient from the fprem1
+	   condition codes (C0 = Q2, C3 = Q1, C1 = Q0), un-permuting them
+	   via the 3-bit lookup table packed into 0xef2a60.  */
+ movl %eax, %ecx
+ shrl $8, %eax
+ shrl $12, %ecx
+ andl $4, %ecx
+ andl $3, %eax
+ orl %eax, %ecx
+ leal (%ecx,%ecx,2),%ecx
+ movl $0xef2a60, %eax
+ shrl %cl, %eax
+ andl $7, %eax
+ movl QUOP(%esp), %ecx
+ movl DVDND+8(%esp), %edx
+ xorl DVSOR+8(%esp), %edx
+ testl $0x8000, %edx
+ jz 1f
+ negl %eax
+1: movl %eax, (%ecx)
+
+ ret
+END (__remquol)
+weak_alias (__remquol, remquol)
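
What the condition-code juggling delivers is the C99 remquo contract: the
remainder of x REM y plus at least the three low bits of the rounded
quotient, with the quotient's sign.  A small usage example (results assume
round-to-nearest):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    int quo;
    double r = remquo (7.0, 2.0, &quo);
    /* 7/2 = 3.5 rounds to the even quotient 4, so the remainder is
       7 - 4*2 = -1 and the low quotient bits are 4.  */
    printf ("r = %g, quo & 7 = %d\n", r, quo & 7);
    return 0;
  }
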
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rint.S b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
new file mode 100644
index 0000000000..be36c5f0ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rint.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rint.S,v 1.4 1995/05/09 00:16:08 jtc Exp $")
+
+ENTRY(__rint)
+ fldl 4(%esp)
+ frndint
+ ret
+END (__rint)
+weak_alias (__rint, rint)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintf.S b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
new file mode 100644
index 0000000000..2b358c1cf1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S
@@ -0,0 +1,15 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_rintf.S,v 1.3 1995/05/09 00:17:22 jtc Exp $")
+
+ENTRY(__rintf)
+ flds 4(%esp)
+ frndint
+ ret
+END (__rintf)
+weak_alias (__rintf, rintf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintl.c b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
new file mode 100644
index 0000000000..66af9cb675
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c
@@ -0,0 +1,18 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__rintl (long double x)
+{
+ long double res;
+
+ asm ("frndint" : "=t" (res) : "0" (x));
+ return res;
+}
+
+weak_alias (__rintl, rintl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
new file mode 100644
index 0000000000..1009713fbc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbn, so we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
new file mode 100644
index 0000000000..5e558c3540
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnf, so we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
new file mode 100644
index 0000000000..cda2ec11c8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c
@@ -0,0 +1,2 @@
+/* Nothing to do.  This function is the same as scalbnl, so we define an
+   alias.  */
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
new file mode 100644
index 0000000000..4e90903115
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbn.S,v 1.4 1995/05/09 00:19:06 jtc Exp $")
+
+ENTRY(__scalbn)
+ fildl 12(%esp)
+ fldl 4(%esp)
+ fscale
+ fstp %st(1)
+ DBL_NARROW_EVAL
+ ret
+END (__scalbn)
+strong_alias (__scalbn, __scalbln)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbn, scalbln, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
new file mode 100644
index 0000000000..f8353c4c75
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S
@@ -0,0 +1,24 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_scalbnf.S,v 1.3 1995/05/09 00:19:59 jtc Exp $")
+
+ENTRY(__scalbnf)
+ fildl 8(%esp)
+ flds 4(%esp)
+ fscale
+ fstp %st(1)
+ FLT_NARROW_EVAL
+ ret
+END (__scalbnf)
+strong_alias (__scalbnf, __scalblnf)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnf, scalblnf, GLIBC_2_1);
+#endif
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
new file mode 100644
index 0000000000..839b5ff353
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S
@@ -0,0 +1,23 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: $")
+
+ENTRY(__scalbnl)
+ fildl 16(%esp)
+ fldt 4(%esp)
+ fscale
+ fstp %st(1)
+ ret
+END (__scalbnl)
+strong_alias (__scalbnl, __scalblnl)
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20)
+compat_symbol (libc, __scalbnl, scalblnl, GLIBC_2_1);
+#endif
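
In all three scalbn variants, fscale adds st(1), truncated to an integer, to
the exponent of st(0) — that is x * 2^n without ever computing the power
explicitly.  For illustration:

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    printf ("%g\n", scalbn (3.0, 4));    /* 48: 3 * 2^4 */
    printf ("%g\n", scalbn (1.0, -10));  /* 0.000976562: 2^-10 */
    return 0;
  }
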
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significand.S b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
new file mode 100644
index 0000000000..4859b7ed71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significand.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significand.S,v 1.4 1995/05/09 00:21:47 jtc Exp $")
+
+ENTRY(__significand)
+ fldl 4(%esp)
+ fxtract
+ fstp %st(1)
+ ret
+END (__significand)
+weak_alias (__significand, significand)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandf.S b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
new file mode 100644
index 0000000000..3a2de97759
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_significandf.S,v 1.3 1995/05/09 00:24:07 jtc Exp $")
+
+ENTRY(__significandf)
+ flds 4(%esp)
+ fxtract
+ fstp %st(1)
+ ret
+END (__significandf)
+weak_alias (__significandf, significandf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandl.c b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
new file mode 100644
index 0000000000..b8cb093502
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c
@@ -0,0 +1,19 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Changes for long double by Ulrich Drepper <drepper@cygnus.com>
+ * Public domain.
+ */
+
+#include <math_private.h>
+
+long double
+__significandl (long double x)
+{
+ long double res;
+
+ asm ("fxtract\n"
+ "fstp %%st(1)" : "=t" (res) : "0" (x));
+ return res;
+}
+
+weak_alias (__significandl, significandl)
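
Here the other half of fxtract is kept: popping the exponent with
fstp %st(1) leaves the significand, a value in [1, 2).  For example
(significand is a GNU extension, hence _GNU_SOURCE):

  #define _GNU_SOURCE
  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    printf ("%g\n", significand (48.0)); /* 1.5: 48 = 1.5 * 2^5 */
    printf ("%g\n", significand (0.25)); /* 1.0: 0.25 = 1.0 * 2^-2 */
    return 0;
  }
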
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_trunc.S b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
new file mode 100644
index 0000000000..e9a850b877
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S
@@ -0,0 +1,37 @@
+/* Truncate double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ENTRY(__trunc)
+ fldl 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ movl $0xc00, %edx
+ orl 4(%esp), %edx
+ movl %edx, (%esp)
+ fldcw (%esp)
+ frndint
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END(__trunc)
+weak_alias (__trunc, trunc)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncf.S b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
new file mode 100644
index 0000000000..a93f5b9a2e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S
@@ -0,0 +1,37 @@
+/* Truncate float value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ENTRY(__truncf)
+ flds 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ movl $0xc00, %edx
+ orl 4(%esp), %edx
+ movl %edx, (%esp)
+ fldcw (%esp)
+ frndint
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END(__truncf)
+weak_alias (__truncf, truncf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncl.S b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
new file mode 100644
index 0000000000..a884123612
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S
@@ -0,0 +1,40 @@
+/* Truncate long double value.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ENTRY(__truncl)
+ fldt 4(%esp)
+ subl $32, %esp
+ cfi_adjust_cfa_offset (32)
+ fnstenv 4(%esp)
+ movl $0xc00, %edx
+ orl 4(%esp), %edx
+ movl %edx, (%esp)
+ fldcw (%esp)
+ frndint
+ fnstsw
+ andl $0x1, %eax
+ orl %eax, 8(%esp)
+ fldenv 4(%esp)
+ addl $32, %esp
+ cfi_adjust_cfa_offset (-32)
+ ret
+END(__truncl)
+weak_alias (__truncl, truncl)
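
Setting both rounding-control bits (0xc00) selects round-toward-zero before
frndint, and reloading the saved environment restores the caller's mode and
drops the inexact flag.  A portable sketch of the same dance:

  #include <fenv.h>
  #include <math.h>

  double
  trunc_sketch (double x)
  {
    int save = fegetround ();
    fesetround (FE_TOWARDZERO);  /* the 0xc00 control-word bits */
    double r = nearbyint (x);    /* frndint under the new mode */
    fesetround (save);           /* fldenv's job in the assembly */
    return r;
  }
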
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowexp.c b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowexp.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/slowpow.c b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
new file mode 100644
index 0000000000..1cc8931700
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/slowpow.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/t_exp.c b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
new file mode 100644
index 0000000000..fd37963b05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/t_exp.c
@@ -0,0 +1 @@
+/* Empty. Not needed. */
diff --git a/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
new file mode 100644
index 0000000000..ddd36d0964
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c
@@ -0,0 +1,8 @@
+/* The inline __ieee754_sqrt is not correctly rounded; it is OK for
+   most internal uses in glibc, but not for sqrt itself.  */
+#define __ieee754_sqrt __avoid_ieee754_sqrt
+#include <math.h>
+#include <math_private.h>
+#undef __ieee754_sqrt
+extern double __ieee754_sqrt (double);
+#include <math/w_sqrt_compat.c>
diff --git a/REORG.TODO/sysdeps/i386/gccframe.h b/REORG.TODO/sysdeps/i386/gccframe.h
new file mode 100644
index 0000000000..579da40ae9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gccframe.h
@@ -0,0 +1,27 @@
+/* Definition of object in frame unwind info. i386 version.
+ Copyright (C) 2001-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define DWARF_FRAME_REGISTERS 17
+
+#define CRT_GET_RFIB_DATA(BASE) \
+ { \
+ register void *__ebx __asm__("ebx");\
+ BASE = __ebx; \
+ }
+
+#include <sysdeps/generic/gccframe.h>
diff --git a/REORG.TODO/sysdeps/i386/gmp-mparam.h b/REORG.TODO/sysdeps/i386/gmp-mparam.h
new file mode 100644
index 0000000000..7ea503a403
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
diff --git a/REORG.TODO/sysdeps/i386/htonl.S b/REORG.TODO/sysdeps/i386/htonl.S
new file mode 100644
index 0000000000..63279bb6e1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htonl.S
@@ -0,0 +1,34 @@
+/* Change byte order in a word.  For Intel 80x86 with x >= 4 (bswap
+   requires an i486 or later).
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+ INPUT PARAMETERS:
+ word (sp + 4)
+*/
+
+ .text
+ENTRY (htonl)
+ movl 4(%esp), %eax
+ bswap %eax
+ ret
+END (htonl)
+
+weak_alias (htonl, ntohl)
diff --git a/REORG.TODO/sysdeps/i386/htons.S b/REORG.TODO/sysdeps/i386/htons.S
new file mode 100644
index 0000000000..a3c53a9944
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/htons.S
@@ -0,0 +1,35 @@
+/* Change byte order in a word.  For Intel 80x86 with x >= 3.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+ INPUT PARAMETERS:
+ word (sp + 4)
+*/
+
+ .text
+ENTRY (htons)
+ movl 4(%esp), %eax
+ andl $0xffff, %eax
+ rorw $8, %ax
+ ret
+END (htons)
+
+weak_alias (htons, ntohs)
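
On little-endian x86 both functions really swap (bswap for 32 bits, rorw for
16); on big-endian targets they are identities.  A quick check, with output
shown for x86:

  #include <arpa/inet.h>
  #include <stdio.h>

  int
  main (void)
  {
    printf ("0x%08x\n", htonl (0x12345678)); /* 0x78563412 on x86 */
    printf ("0x%04x\n", htons (0x1234));     /* 0x3412 on x86 */
    return 0;
  }
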
diff --git a/REORG.TODO/sysdeps/i386/i386-mcount.S b/REORG.TODO/sysdeps/i386/i386-mcount.S
new file mode 100644
index 0000000000..733b8c78e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i386-mcount.S
@@ -0,0 +1,79 @@
+/* i386-specific implementation of profiling support.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since on ix86 it
+   must not clobber any register.  There are several reasons for this:
+ - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+ use of profiling together with nested functions
+ - the ELF `fixup' function uses GCC's regparm feature
+ - some (future) systems might want to pass parameters in registers. */
+
+ .globl C_SYMBOL_NAME(_mcount)
+ .type C_SYMBOL_NAME(_mcount), @function
+ .align ALIGNARG(4)
+C_LABEL(_mcount)
+ /* Save the caller-clobbered registers. */
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ movl 12(%esp), %edx
+ movl 4(%ebp), %eax
+
+ /* No need to access the PLT or GOT, __mcount_internal is an
+ internal function and we can make a relative call. */
+ call C_SYMBOL_NAME(__mcount_internal)
+
+ /* Pop the saved registers. Please note that `mcount' has no
+ return value. */
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+ ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(_mcount))
+
+#undef mcount
+weak_alias (_mcount, mcount)
+
+	/* Same as above, but does not require a frame pointer.  */
+ .globl C_SYMBOL_NAME(__fentry__)
+ .type C_SYMBOL_NAME(__fentry__), @function
+ .align ALIGNARG(4)
+C_LABEL(__fentry__)
+ /* Save the caller-clobbered registers. */
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ movl 12(%esp), %edx
+ movl 16(%esp), %eax
+
+ /* No need to access the PLT or GOT, __mcount_internal is an
+ internal function and we can make a relative call. */
+ call C_SYMBOL_NAME(__mcount_internal)
+
+ /* Pop the saved registers. Please note that `__fentry__' has no
+ return value. */
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+ ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(__fentry__))
diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..f73df092f0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+ sum in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_add_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl S2(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl SIZE(%esp),%ecx
+ movl (%ebx),%ebp
+ cfi_rel_offset (ebp, 4)
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx /* zero carry flag */
+ jz L(end)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+
+ ALIGN (3)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %ebp,%eax
+ movl 4(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 8(%ebx),%ebp
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ adcl %ebp,%eax
+ movl 12(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 16(%ebx),%ebp
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ adcl %ebp,%eax
+ movl 20(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 24(%ebx),%ebp
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %ebp,%eax
+ movl 28(%ebx),%ebp
+ adcl %ebp,%edx
+ movl 32(%ebx),%ebp
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebx),%ebx
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+L(end):
+ decl %edx /* test %edx w/o clobbering carry */
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ adcl %ebp,%eax
+ movl 4(%ebx),%ebp
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebx),%ebx
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ adcl %ebp,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_add_n)
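
The unrolled loop implements the usual limb-vector add with carry
propagation.  A plain C reference of the contract — 32-bit limbs assumed,
not the Pentium-tuned code:

  typedef unsigned int mp_limb;  /* 32-bit limb on i386 */

  /* Add {s1, n} and {s2, n} into {res, n}; return the carry out.  */
  mp_limb
  mpn_add_n_sketch (mp_limb *res, const mp_limb *s1,
                    const mp_limb *s2, long n)
  {
    mp_limb carry = 0;
    for (long i = 0; i < n; i++)
      {
        mp_limb sum = s1[i] + s2[i] + carry;
        /* Carry out iff the 32-bit add wrapped: sum < s1[i], or
           sum == s1[i] with a carry in (s2[i] was 0xffffffff).  */
        carry = sum < s1[i] || (carry && sum == s1[i]);
        res[i] = sum;
      }
    return carry;
  }
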
diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
new file mode 100644
index 0000000000..a713192982
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_addmul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %ebp, %eax
+ movl (%res_ptr,%size,4), %ebp
+
+ adcl $0, %edx
+ addl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_addmul_1)
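
mull produces the 64-bit product in %edx:%eax, and the loop folds in the old
destination limb plus the running carry.  The same contract in portable C,
using a 64-bit accumulator in place of the register pair:

  typedef unsigned int mp_limb;

  /* res[i] += s1[i] * v over n limbs; return the final carry limb.  */
  mp_limb
  mpn_addmul_1_sketch (mp_limb *res, const mp_limb *s1, long n, mp_limb v)
  {
    unsigned long long carry = 0;
    for (long i = 0; i < n; i++)
      {
        unsigned long long t = (unsigned long long) s1[i] * v
                               + res[i] + carry;
        res[i] = (mp_limb) t;  /* low 32 bits, like %eax */
        carry = t >> 32;       /* high 32 bits, like %edx */
      }
    return (mp_limb) carry;
  }
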
diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S
new file mode 100644
index 0000000000..2a106719a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i586/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h
new file mode 100644
index 0000000000..4711212e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MINIMUM_ISA 586
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S
new file mode 100644
index 0000000000..7941c28d9d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_lshift --
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_lshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebp, 0)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl SIZE(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions. */
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) /* jump if s_ptr + 1 >= res_ptr */
+ leal (%esi,%ebx,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) /* jump if res_ptr >= s_ptr + size */
+
+L(normal):
+ leal -4(%edi,%ebx,4),%edi
+ leal -4(%esi,%ebx,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl %cl,%edx,%eax /* compute carry limb */
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+ jz L(end)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(oop): movl -28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl %cl,%eax,%ebp
+ shldl %cl,%edx,%eax
+ movl %ebp,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebp
+ movl -12(%esi),%eax
+ shldl %cl,%ebp,%edx
+ shldl %cl,%eax,%ebp
+ movl %edx,-8(%edi)
+ movl %ebp,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebp
+ shldl %cl,%edx,%eax
+ shldl %cl,%ebp,%edx
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl %cl,%eax,%ebp
+ shldl %cl,%edx,%eax
+ movl %ebp,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebx
+ jnz L(oop)
+
+L(end): popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ andl $7,%ebx
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl %cl,%eax,%edx
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebx
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx /* compute least significant limb */
+ movl %edx,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+/* In this shift-by-1 path we loop from the least significant end of the
+   arrays, which is only permissible because the checks above have ruled
+   out a harmful overlap; the function is documented to work for
+   overlapping source and destination.  */
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebp, 4)
+ cfi_rel_offset (ebx, 0)
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+
+ addl %edx,%edx
+ incl %ebx
+ decl %ebx
+ jz L(Lend)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(Loop):
+ movl 28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebp,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebp
+ movl 12(%esi),%eax
+ adcl %ebp,%ebp
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebp,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebp
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebp,%ebp
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebp,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+	leal	32(%esi),%esi		/* use leal so as not to clobber carry */
+ leal 32(%edi),%edi
+ decl %ebx
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ sbbl %eax,%eax /* save carry in %eax */
+ andl $7,%ebx
+ jz L(Lend2)
+ addl %eax,%eax /* restore carry from eax */
+L(Loop2):
+ movl %edx,%ebp
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebp,(%edi)
+
+	leal	4(%esi),%esi		/* use leal so as not to clobber carry */
+ leal 4(%edi),%edi
+ decl %ebx
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax /* restore carry from eax */
+L(L1): movl %edx,(%edi) /* store last limb */
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_lshift)
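
The normal path above works from the most significant limb downward so that
overlapping operands with res >= src stay safe; the shift-by-1 special case
instead rides the carry flag.  A portable C reference of the general
contract (cnt must be 1..31, or the shifts below are undefined):

  typedef unsigned int mp_limb;

  /* Shift {src, n} left by cnt bits into {res, n}; return the bits
     shifted out of the top limb.  Writing from the most significant
     end down keeps it correct when res >= src.  */
  mp_limb
  mpn_lshift_sketch (mp_limb *res, const mp_limb *src, long n, unsigned cnt)
  {
    mp_limb high = src[n - 1];
    mp_limb ret = high >> (32 - cnt);  /* carry-out limb */
    for (long i = n - 1; i > 0; i--)
      {
        res[i] = (high << cnt) | (src[i - 1] >> (32 - cnt));
        high = src[i - 1];
      }
    res[0] = high << cnt;
    return ret;
  }
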
diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h
new file mode 100644
index 0000000000..39f020a746
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h
@@ -0,0 +1,95 @@
+/* memcopy.h -- definitions for memory copy functions. Pentium version.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ Contributed by Torbjorn Granlund (tege@sics.se).
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Get the i386 definitions. We will override some of them below. */
+#include <sysdeps/i386/memcopy.h>
+
+/* Written like this, the Pentium pipeline can execute the loop at a
+ sustained rate of 2 instructions/clock, or asymptotically 480
+   Mbytes/second at 60 MHz.  */
+
+#undef WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
+ do \
+ { \
+ asm volatile ("subl $32,%2\n" \
+ "js 2f\n" \
+ "movl 0(%0),%%edx\n" /* alloc dest line */ \
+ "1:\n" \
+ "movl 28(%0),%%eax\n" /* alloc dest line */ \
+ "subl $32,%2\n" /* decr loop count */ \
+ "movl 0(%1),%%eax\n" /* U pipe */ \
+ "movl 4(%1),%%edx\n" /* V pipe */ \
+ "movl %%eax,0(%0)\n" /* U pipe */ \
+ "movl %%edx,4(%0)\n" /* V pipe */ \
+ "movl 8(%1),%%eax\n" \
+ "movl 12(%1),%%edx\n" \
+ "movl %%eax,8(%0)\n" \
+ "movl %%edx,12(%0)\n" \
+ "movl 16(%1),%%eax\n" \
+ "movl 20(%1),%%edx\n" \
+ "movl %%eax,16(%0)\n" \
+ "movl %%edx,20(%0)\n" \
+ "movl 24(%1),%%eax\n" \
+ "movl 28(%1),%%edx\n" \
+ "movl %%eax,24(%0)\n" \
+ "movl %%edx,28(%0)\n" \
+ "leal 32(%1),%1\n" /* update src ptr */ \
+ "leal 32(%0),%0\n" /* update dst ptr */ \
+ "jns 1b\n" \
+ "2: addl $32,%2" : \
+ "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \
+ "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
+ "ax", "dx"); \
+ } while (0)
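+
+/* An illustrative sketch (not part of this header) of how the generic
+   string routines use this macro pair: a word-wise bulk copy first,
+   then the sub-word tail -- up to 31 bytes with this unrolled
+   version -- finished byte-wise:
+
+     WORD_COPY_FWD (dstp, srcp, len, len);   // len receives the remainder
+     BYTE_COPY_FWD (dstp, srcp, len);        // copy the last few bytes
+ */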
+
+#undef WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \
+ do \
+ { \
+ asm volatile ("subl $32,%2\n" \
+ "js 2f\n" \
+ "movl -4(%0),%%edx\n" \
+ "1:\n" \
+ "movl -32(%0),%%eax\n" \
+ "subl $32,%2\n" \
+ "movl -4(%1),%%eax\n" \
+ "movl -8(%1),%%edx\n" \
+ "movl %%eax,-4(%0)\n" \
+ "movl %%edx,-8(%0)\n" \
+ "movl -12(%1),%%eax\n" \
+ "movl -16(%1),%%edx\n" \
+ "movl %%eax,-12(%0)\n" \
+ "movl %%edx,-16(%0)\n" \
+ "movl -20(%1),%%eax\n" \
+ "movl -24(%1),%%edx\n" \
+ "movl %%eax,-20(%0)\n" \
+ "movl %%edx,-24(%0)\n" \
+ "movl -28(%1),%%eax\n" \
+ "movl -32(%1),%%edx\n" \
+ "movl %%eax,-28(%0)\n" \
+ "movl %%edx,-32(%0)\n" \
+ "leal -32(%1),%1\n" \
+ "leal -32(%0),%0\n" \
+ "jns 1b\n" \
+ "2: addl $32,%2" : \
+ "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \
+ "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \
+ "ax", "dx"); \
+ } while (0)
diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S
new file mode 100644
index 0000000000..6474a3f653
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S
@@ -0,0 +1,124 @@
+/* Highly optimized version for i586.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY (__memcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 4)
+ movl SRC(%esp), %esi
+ cfi_rel_offset (esi, 0)
+ movl LEN(%esp), %ecx
+ movl %edi, %eax
+
+ /* We need this in any case. */
+ cld
+
+ /* Cutoff for the big loop is a size of 32 bytes since otherwise
+ the loop will never be entered. */
+ cmpl $32, %ecx
+ jbe L(1)
+
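+	/* Align the destination: (-%edi) & 3 is the number of byte
+	   copies needed to reach a 4-byte boundary.  The xchgl below
+	   leaves that count in %ecx for the rep movsb and parks the
+	   remaining length in %eax.  */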
+ negl %eax
+ andl $3, %eax
+ subl %eax, %ecx
+ xchgl %eax, %ecx
+
+ rep; movsb
+
+ movl %eax, %ecx
+ subl $32, %ecx
+ js L(2)
+
+ /* Read ahead to make sure we write in the cache since the stupid
+ i586 designers haven't implemented read-on-write-miss. */
+ movl (%edi), %eax
+L(3): movl 28(%edi), %edx
+
+ /* Now correct the loop counter. Please note that in the following
+ code the flags are not changed anymore. */
+ subl $32, %ecx
+
+ movl (%esi), %eax
+ movl 4(%esi), %edx
+ movl %eax, (%edi)
+ movl %edx, 4(%edi)
+ movl 8(%esi), %eax
+ movl 12(%esi), %edx
+ movl %eax, 8(%edi)
+ movl %edx, 12(%edi)
+ movl 16(%esi), %eax
+ movl 20(%esi), %edx
+ movl %eax, 16(%edi)
+ movl %edx, 20(%edi)
+ movl 24(%esi), %eax
+ movl 28(%esi), %edx
+ movl %eax, 24(%edi)
+ movl %edx, 28(%edi)
+
+ leal 32(%esi), %esi
+ leal 32(%edi), %edi
+
+ jns L(3)
+
+ /* Correct extra loop counter modification. */
+L(2): addl $32, %ecx
+#ifndef USE_AS_MEMPCPY
+ movl DEST(%esp), %eax
+#endif
+
+L(1): rep; movsb
+
+#ifdef USE_AS_MEMPCPY
+ movl %edi, %eax
+#endif
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memcpy)
+#ifndef USE_AS_MEMPCPY
+libc_hidden_builtin_def (memcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
new file mode 100644
index 0000000000..720a4c0923
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_MEMPCPY
+#define memcpy __mempcpy
+#define __memcpy_chk __mempcpy_chk
+#include <sysdeps/i386/i586/memcpy.S>
+
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S
new file mode 100644
index 0000000000..4f8f1bcf94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memset.S
@@ -0,0 +1,121 @@
+/* memset/bzero -- set memory area to CH/0
+ Highly optimized version for ix86, x>=5.
+ Copyright (C) 1996-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Torbjorn Granlund, <tege@matematik.su.se>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define DEST RTN
+#ifdef USE_AS_BZERO
+# define LEN DEST+4
+#else
+# define CHR DEST+4
+# define LEN CHR+4
+#endif
+
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 0)
+ movl LEN(%esp), %edx
+#ifdef USE_AS_BZERO
+ xorl %eax, %eax /* we fill with 0 */
+#else
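+	/* Replicate the fill byte into all four bytes of %eax:
+	   %al -> %ah gives 0|0|c|c, then the shift and the movw
+	   produce c|c|c|c.  */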
+ movb CHR(%esp), %al
+ movb %al, %ah
+ movl %eax, %ecx
+ shll $16, %eax
+ movw %cx, %ax
+#endif
+ cld
+
+/* If less than 36 bytes to write, skip tricky code (it wouldn't work). */
+ cmpl $36, %edx
+ movl %edx, %ecx /* needed when branch is taken! */
+ jl L(2)
+
+/* First write 0-3 bytes to make the pointer 32-bit aligned. */
+ movl %edi, %ecx /* Copy ptr to ecx... */
+ negl %ecx /* ...and negate that and... */
+ andl $3, %ecx /* ...mask to get byte count. */
+ subl %ecx, %edx /* adjust global byte count */
+ rep
+ stosb
+
+ subl $32, %edx /* offset count for unrolled loop */
+ movl (%edi), %ecx /* Fetch destination cache line */
+
+ .align 2, 0x90 /* supply 0x90 for broken assemblers */
+L(1): movl 28(%edi), %ecx /* allocate cache line for destination */
+ subl $32, %edx /* decr loop count */
+ movl %eax, 0(%edi) /* store words pairwise */
+ movl %eax, 4(%edi)
+ movl %eax, 8(%edi)
+ movl %eax, 12(%edi)
+ movl %eax, 16(%edi)
+ movl %eax, 20(%edi)
+ movl %eax, 24(%edi)
+ movl %eax, 28(%edi)
+ leal 32(%edi), %edi /* update destination pointer */
+ jge L(1)
+
+ leal 32(%edx), %ecx /* reset offset count */
+
+/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
+L(2): shrl $2, %ecx /* convert byte count to longword count */
+ rep
+ stosl
+
+/* Finally write the last 0-3 bytes. */
+ movl %edx, %ecx
+ andl $3, %ecx
+ rep
+ stosb
+
+#ifndef USE_AS_BZERO
+ /* Load result (only if used as memset). */
+ movl DEST(%esp), %eax /* start address of destination is result */
+#endif
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+ && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h
new file mode 100644
index 0000000000..c8170874d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/memusage.h
@@ -0,0 +1 @@
+#include "../i686/memusage.h"
diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S
new file mode 100644
index 0000000000..bd3a07de90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,90 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
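+
+/* In C terms the operation is (an illustrative model only, using
+   GMP-style 32-bit limbs; the real entry point is the assembly below):
+
+     mp_limb_t
+     mpn_mul_1_model (mp_limb_t *res, const mp_limb_t *s1,
+                      mp_size_t size, mp_limb_t s2_limb)
+     {
+       mp_limb_t carry = 0;
+       for (mp_size_t i = 0; i < size; i++)
+         {
+           unsigned long long p
+             = (unsigned long long) s1[i] * s2_limb + carry;
+           res[i] = (mp_limb_t) p;          /* low limb of the product */
+           carry = (mp_limb_t) (p >> 32);   /* high limb becomes carry */
+         }
+       return carry;                        /* most significant limb */
+     }
+ */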
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_mul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_mul_1)
diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S
new file mode 100644
index 0000000000..24c76ee0bb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,255 @@
+/* Pentium optimized __mpn_rshift -- shift a limb vector right.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_rshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebp, 0)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl SIZE(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl CNT(%esp),%ecx
+
+/* We can use faster code for shift-by-1 under certain conditions. */
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) /* jump if res_ptr + 1 >= s_ptr */
+ leal (%edi,%ebx,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) /* jump if s_ptr >= res_ptr + size */
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl %cl,%edx,%eax /* compute carry limb */
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+ jz L(end)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl %cl,%eax,%ebp
+ shrdl %cl,%edx,%eax
+ movl %ebp,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebp
+ movl 12(%esi),%eax
+ shrdl %cl,%ebp,%edx
+ shrdl %cl,%eax,%ebp
+ movl %edx,8(%edi)
+ movl %ebp,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebp
+ shrdl %cl,%edx,%eax
+ shrdl %cl,%ebp,%edx
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl %cl,%eax,%ebp
+ shrdl %cl,%edx,%eax
+ movl %ebp,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebx
+ jnz L(oop)
+
+L(end): popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ andl $7,%ebx
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl %cl,%eax,%edx /* compute result limb */
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebx
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx /* compute most significant limb */
+ movl %edx,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+/* The code below loops from the most significant end of the arrays.
+   That order is only permissible for certain overlaps of source and
+   destination; since the function is documented to work for
+   overlapping source and destination, the pointer checks above branch
+   here only when this ordering is safe.
+*/
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebp, 4)
+ cfi_rel_offset (ebx, 0)
+L(special):
+ leal -4(%edi,%ebx,4),%edi
+ leal -4(%esi,%ebx,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebx
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ shrl $3,%ebx
+
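+	/* Shift right by one bit via rcrl: CF carries the bit shifted
+	   out of each limb into the top of the next limb.  */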
+ shrl $1,%edx
+ incl %ebx
+ decl %ebx
+ jz L(Lend)
+
+ movl (%edi),%eax /* fetch destination cache line */
+
+ ALIGN (2)
+L(Loop):
+ movl -28(%edi),%eax /* fetch destination cache line */
+ movl %edx,%ebp
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl $1,%eax
+ movl %ebp,(%edi)
+ rcrl $1,%edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebp
+ movl -12(%esi),%eax
+ rcrl $1,%ebp
+ movl %edx,-8(%edi)
+ rcrl $1,%eax
+ movl %ebp,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebp
+ rcrl $1,%edx
+ movl %eax,-16(%edi)
+ rcrl $1,%ebp
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl $1,%eax
+ movl %ebp,-24(%edi)
+ rcrl $1,%edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi /* use leal not to clobber carry */
+ leal -32(%edi),%edi
+ decl %ebx
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ sbbl %eax,%eax /* save carry in %eax */
+ andl $7,%ebx
+ jz L(Lend2)
+ addl %eax,%eax /* restore carry from eax */
+L(Loop2):
+ movl %edx,%ebp
+ movl (%esi),%edx
+ rcrl $1,%edx
+ movl %ebp,(%edi)
+
+ leal -4(%esi),%esi /* use leal not to clobber carry */
+ leal -4(%edi),%edi
+ decl %ebx
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax /* restore carry from eax */
+L(L1): movl %edx,(%edi) /* store last limb */
+
+ movl $0,%eax
+ rcrl $1,%eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_rshift)
diff --git a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
new file mode 100644
index 0000000000..8691efd01c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S
@@ -0,0 +1,8 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+
+#include <sysdeps/i386/i586/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S
new file mode 100644
index 0000000000..02f66b8f72
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,348 @@
+/* Find character CH in a NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this respect because, to get
+   as much parallelism, we would have to execute some *more*
+   instructions.
+
+   The code below is structured to reflect the pairing of the
+   instructions as *I think* it is.  I have no processor data book to
+   verify this.  If you find something you think is incorrect let me
+   know.  */
+
+
+/* The magic value which is used throughout in the whole code. */
+#define magic 0xfefefeff
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (strchr)
+
+	pushl %edi		/* Save callee-safe registers.  */
+	cfi_adjust_cfa_offset (4)
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+
+	pushl %ebx
+	cfi_adjust_cfa_offset (4)
+	pushl %ebp
+	cfi_adjust_cfa_offset (4)
+
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ movl %eax, %edi /* duplicate string pointer for later */
+ cfi_rel_offset (edi, 12)
+ xorl %ecx, %ecx /* clear %ecx */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle).  */
+ movb %dl, %dh /* now it is 0|0|c|c */
+ movb %dl, %cl /* we construct the lower half in %ecx */
+
+ shll $16, %edx /* now %edx is c|c|0|0 */
+ movb %cl, %ch /* now %ecx is 0|0|c|c */
+
+ orl %ecx, %edx /* and finally c|c|c|c */
+ andl $3, %edi /* mask alignment bits */
+
+ jz L(11) /* alignment is 0 => start loop */
+
+	movb %dl, %cl		/* C is needed below */
+ jp L(0) /* exactly two bits set */
+
+ xorb (%eax), %cl /* is byte the one we are looking for? */
+ jz L(out) /* yes => return pointer */
+
+	xorb %dl, %cl		/* recover the byte and test for NUL */
+ je L(3) /* yes => return NULL */
+
+ movb 1(%eax), %cl /* load single byte */
+ incl %eax
+
+ cmpb %cl, %dl /* is byte == C? */
+ je L(out) /* aligned => return pointer */
+
+ cmpb $0, %cl /* is byte NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax
+ decl %edi
+
+ jne L(11)
+
+L(0): movb (%eax), %cl /* load single byte */
+
+ cmpb %cl, %dl /* is byte == C? */
+ je L(out) /* aligned => return pointer */
+
+ cmpb $0, %cl /* is byte NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebx, 4)
+ cfi_rel_offset (ebp, 0)
+
+ /* The following code is the preparation for the loop. The
+ four instruction up to `L1' will not be executed in the loop
+ because the same code is found at the end of the loop, but
+ there it is executed in parallel with other instructions. */
+L(11): movl (%eax), %ecx
+ movl $magic, %ebp
+
+ movl $magic, %edi
+ addl %ecx, %ebp
+
+	/* The main loop: it looks complex and indeed it is.  I would
+	   love to say `it was hard to write, so it should be hard to
+	   read' but I will give some more hints.  To fully understand
+	   this code you should first take a look at the i486 version.
+	   The basic algorithm is the same, but here the code is
+	   organized in a way which permits using both pipelines all
+	   the time.
+
+	   I tried to make it a bit more understandable by indenting
+	   the code according to the stage in the algorithm.  It goes as
+	   follows:
+		check for 0 in 1st word
+		check for C in 1st word
+		check for 0 in 2nd word
+		check for C in 2nd word
+		check for 0 in 3rd word
+		check for C in 3rd word
+		check for 0 in 4th word
+		check for C in 4th word
+
+	   Please note that doing the test for NUL before the test for
+	   C allows us to overlap the test for 0 in the next word with
+	   the test for C.  */
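+
+	/* In C terms, the per-word NUL test below is (an illustrative
+	   sketch only; `magic' is the 0xfefefeff constant defined
+	   above):
+
+	     static int may_have_nul (unsigned int word)
+	     {
+	       unsigned int sum = word + 0xfefefeff;	// magic
+	       if (sum > word)			// no carry out of bit 31
+	         return 1;			// NUL possible in top byte
+	       // A hole bit (8, 16 or 24) failed to flip: a zero byte
+	       // blocked the carry.
+	       return ((sum ^ word) | 0xfefefeff) + 1 != 0;
+	     }
+
+	   A byte equal to C becomes a zero byte after XOR-ing the word
+	   with c|c|c|c, so the C test is may_have_nul (word ^ (c|c|c|c)).
+	   False positives are sorted out bytewise at L(4) and L(5).  */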
+
+L(1):	xorl %ecx, %ebp			/* (word+magic)^word */
+ addl %ecx, %edi /* add magic word */
+
+ leal 4(%eax), %eax /* increment pointer */
+ jnc L(4) /* previous addl caused overflow? */
+
+ movl %ecx, %ebx /* duplicate original word */
+	orl $magic, %ebp		/* ((word+magic)^word)|magic */
+
+	addl $1, %ebp			/* ((word+magic)^word)|magic == 0xffffffff? */
+ jne L(4) /* yes => we found word with NUL */
+
+ movl $magic, %esi /* load magic value */
+	xorl %edx, %ebx			/* zero the bytes which are C */
+
+ movl (%eax), %ecx
+ addl %ebx, %esi /* (word+magic) */
+
+ movl $magic, %edi
+ jnc L(5) /* previous addl caused overflow? */
+
+ movl %edi, %ebp
+ xorl %ebx, %esi /* (word+magic)^word */
+
+ addl %ecx, %ebp
+ orl $magic, %esi /* ((word+magic)^word)|magic */
+
+ addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
+ jne L(5) /* yes => we found word with C */
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+ jne L(5)
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+ jne L(5)
+
+ xorl %ecx, %ebp
+ addl %ecx, %edi
+
+ leal 4(%eax), %eax
+ jnc L(4)
+
+ movl %ecx, %ebx
+ orl $magic, %ebp
+
+ addl $1, %ebp
+ jne L(4)
+
+ movl $magic, %esi
+ xorl %edx, %ebx
+
+ movl (%eax), %ecx
+ addl %ebx, %esi
+
+ movl $magic, %edi
+ jnc L(5)
+
+ movl %edi, %ebp
+ xorl %ebx, %esi
+
+ addl %ecx, %ebp
+ orl $magic, %esi
+
+ addl $1, %esi
+
+ je L(1)
+
+	/* We know there is no NUL byte in the word, but there is a C
+	   byte.  %ebx holds word^(c|c|c|c), so the byte that matched C
+	   is zero in %ebx.  */
+L(5): subl $4, %eax /* adjust pointer */
+ testb %bl, %bl /* first byte == C? */
+
+ jz L(out) /* yes => return pointer */
+
+ incl %eax /* increment pointer */
+ testb %bh, %bh /* second byte == C? */
+
+ jz L(out) /* yes => return pointer */
+
+ shrl $16, %ebx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmp $0, %bl /* third byte == C */
+ je L(out) /* yes => return pointer */
+
+ incl %eax /* increment pointer */
+
+L(out): popl %ebp /* restore saved registers */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebx, 4)
+ cfi_rel_offset (ebp, 0)
+ /* We know there is a NUL byte in the word. But we have to test
+	   whether there is a C byte before it in the word.  */
+L(4): subl $4, %eax /* adjust pointer */
+ cmpb %dl, %cl /* first byte == C? */
+
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %cl /* first byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+ cmpb %dl, %ch /* second byte == C? */
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %ch /* second byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ shrl $16, %ecx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmpb %dl, %cl /* third byte == C? */
+ je L(out) /* yes => return pointer */
+
+ cmpb $0, %cl /* third byte == NUL? */
+ je L(3) /* yes => return NULL */
+
+ incl %eax /* increment pointer */
+
+	/* The test for the fourth byte is necessary!  */
+ cmpb %dl, %ch /* fourth byte == C? */
+ je L(out) /* yes => return pointer */
+
+L(3): xorl %eax, %eax
+ jmp L(out)
+END (strchr)
+
+#undef index
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S
new file mode 100644
index 0000000000..a444604f4f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S
@@ -0,0 +1,169 @@
+/* strcpy/stpcpy implementation for i586.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+12 /* space for 3 saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+
+#ifndef USE_AS_STPCPY
+# define STRCPY strcpy
+#endif
+
+#define magic 0xfefefeff
+
+ .text
+ENTRY (STRCPY)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 8)
+ movl SRC(%esp), %esi
+ cfi_rel_offset (esi, 4)
+
+ xorl %eax, %eax
+ leal -1(%esi), %ecx
+
+ movl $magic, %ebx
+ cfi_rel_offset (ebx, 0)
+ andl $3, %ecx
+
+#ifdef PIC
+ call 2f
+ cfi_adjust_cfa_offset (4)
+2: popl %edx
+ cfi_adjust_cfa_offset (-4)
+ /* 0xb is the distance between 2: and 1: but we avoid writing
+ 1f-2b because the assembler generates worse code. */
+ leal 0xb(%edx,%ecx,8), %ecx
+#else
+ leal 1f(,%ecx,8), %ecx
+#endif
+
+ jmp *%ecx
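+
+	/* Each aligned byte-copy block at 1: below is exactly 8 bytes
+	   long (the computed jump relies on it), so the target
+	   1f + 8*%ecx, with %ecx = (src-1) & 3, performs just enough
+	   single-byte copies to make %esi word-aligned when it falls
+	   into L(1).  */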
+
+ .align 8
+1:
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+ orb (%esi), %al
+ jz L(end)
+ stosb
+ xorl %eax, %eax
+ incl %esi
+
+L(1): movl (%esi), %ecx
+ leal 4(%esi),%esi
+
+ subl %ecx, %eax
+ addl %ebx, %ecx
+
+ decl %eax
+ jnc L(3)
+
+ movl %ecx, %edx
+ xorl %ecx, %eax
+
+ subl %ebx, %edx
+ andl $~magic, %eax
+
+ jne L(4)
+
+ movl %edx, (%edi)
+ leal 4(%edi),%edi
+
+ jmp L(1)
+
+L(3): movl %ecx, %edx
+
+ subl %ebx, %edx
+
+L(4): movb %dl, (%edi)
+ testb %dl, %dl
+
+ movl %edx, %eax
+ jz L(end2)
+
+ shrl $16, %eax
+ movb %dh, 1(%edi)
+#ifdef USE_AS_STPCPY
+ addl $1, %edi
+#endif
+
+ cmpb $0, %dh
+ jz L(end2)
+
+#ifdef USE_AS_STPCPY
+ movb %al, 1(%edi)
+ addl $1, %edi
+
+ cmpb $0, %al
+ jz L(end2)
+
+ addl $1, %edi
+#else
+ movb %al, 2(%edi)
+ testb %al, %al
+
+ leal 3(%edi), %edi
+ jz L(end2)
+#endif
+
+L(end): movb %ah, (%edi)
+
+L(end2):
+#ifdef USE_AS_STPCPY
+ movl %edi, %eax
+#else
+ movl DEST(%esp), %eax
+#endif
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (STRCPY)
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..cfea2e020f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,182 @@
+/* strlen -- Compute length of NUL terminated string.
+ Highly optimized version for ix86, x>=5.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this respect because, to get
+   as much parallelism, we would have to execute some *more*
+   instructions.
+
+   The code below is structured to reflect the pairing of the
+   instructions as *I think* it is.  I have no processor data book to
+   verify this.  If you find something you think is incorrect let me
+   know.  */
+
+
+/* The magic value which is used throughout in the whole code. */
+#define magic 0xfefefeff
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+
+ .text
+ENTRY (strlen)
+
+ movl STR(%esp), %eax
+ movl $3, %edx /* load mask (= 3) */
+
+ andl %eax, %edx /* separate last two bits of address */
+
+ jz L(1) /* aligned => start loop */
+ jp L(0) /* exactly two bits set */
+
+ cmpb %dh, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ cmpb %dh, (%eax) /* is byte NUL? */
+
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ xorl $2, %edx
+
+ jz L(1)
+
+L(0): cmpb %dh, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+ xorl %edx, %edx /* We need %edx == 0 for later */
+
+	/* We exit the loop if adding the magic value to the word fails
+	   to change any of the hole bits of the word.
+
+	   1) Is this safe?  Will it catch all the zero bytes?
+	   Suppose there is a byte with all zeros.  Any carry bits
+	   propagating from its left will fall into the hole at its
+	   least significant bit and stop.  Since there will be no
+	   carry from its most significant bit, the LSB of the
+	   byte to the left will be unchanged, and the zero will be
+	   detected.
+
+	   2) Is this worthwhile?  Will it ignore everything except
+	   zero bytes?  Suppose every byte of the word has a bit set
+	   somewhere.  There will be a carry into bit 8.  If bit 8
+	   is set, this will carry into bit 16.  If bit 8 is clear,
+	   one of bits 9-15 must be set, so there will be a carry
+	   into bit 16.  Similarly, there will be a carry into bit
+	   24.  If one of bits 24-31 is set, there will be a carry
+	   into bit 32 (=carry flag), so all of the hole bits will
+	   be changed.
+
+	   Note: %edx == 0 in any case here.  */
+
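+	/* In C terms, the test applied to each word by the unrolled
+	   loop below is (an illustrative sketch only; `magic' is
+	   defined above):
+
+	     static int may_have_nul (unsigned int word)
+	     {
+	       unsigned int sum = word + 0xfefefeff;	// magic
+	       if (sum > word)			// addl produced no carry
+	         return 1;			// NUL possible in top byte
+	       // A hole bit (8, 16 or 24) failed to flip: a zero byte
+	       // blocked the carry.
+	       return ((sum ^ ~word) & 0x01010100) != 0;	// ~magic
+	     }
+
+	   False positives are resolved by the bytewise checks at L(3).  */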
+L(1):
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ jne L(3) /* yes => determine byte */
+
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ addl $4, %eax /* adjust pointer for *next* word */
+
+ subl %ecx, %edx /* first step to negate word */
+ addl $magic, %ecx /* add magic word */
+
+ decl %edx /* complete negation of word */
+ jnc L(3) /* previous addl caused overflow? */
+
+ xorl %ecx, %edx /* (word+magic)^word */
+
+ andl $~magic, %edx /* any of the carry flags set? */
+
+ je L(1) /* no => start loop again */
+
+
+L(3): subl $4, %eax /* correct too early pointer increment */
+ subl $magic, %ecx
+
+ cmpb $0, %cl /* lowest byte NUL? */
+ jz L(2) /* yes => return */
+
+ inc %eax /* increment pointer */
+ testb %ch, %ch /* second byte NUL? */
+
+ jz L(2) /* yes => return */
+
+ shrl $16, %ecx /* make upper bytes accessible */
+ incl %eax /* increment pointer */
+
+ cmpb $0, %cl /* is third byte NUL? */
+ jz L(2) /* yes => return */
+
+ incl %eax /* increment pointer */
+
+L(2): subl STR(%esp), %eax /* now compute the length as difference
+ between start and terminating NUL
+ character */
+ ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S
new file mode 100644
index 0000000000..21b5a2742c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,143 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+ and store difference in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_sub_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 12)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 8)
+ movl S2(%esp),%ebx
+ cfi_rel_offset (ebx, 0)
+ movl SIZE(%esp),%ecx
+ movl (%ebx),%ebp
+ cfi_rel_offset (ebp, 4)
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx /* zero carry flag */
+ jz L(end)
+ pushl %edx
+ cfi_adjust_cfa_offset (4)
+
+ ALIGN (3)
+L(oop): movl 28(%edi),%eax /* fetch destination cache line */
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ sbbl %ebp,%eax
+ movl 4(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 8(%ebx),%ebp
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ sbbl %ebp,%eax
+ movl 12(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 16(%ebx),%ebp
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ sbbl %ebp,%eax
+ movl 20(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 24(%ebx),%ebp
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ sbbl %ebp,%eax
+ movl 28(%ebx),%ebp
+ sbbl %ebp,%edx
+ movl 32(%ebx),%ebp
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebx),%ebx
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+ cfi_adjust_cfa_offset (-4)
+L(end):
+ decl %edx /* test %edx w/o clobbering carry */
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ sbbl %ebp,%eax
+ movl 4(%ebx),%ebp
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebx),%ebx
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ sbbl %ebp,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_sub_n)
diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S
new file mode 100644
index 0000000000..5e5e121ca2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,94 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_submul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+
+L(oop): adcl $0, %ebp
+ movl (%s1_ptr,%size,4), %eax
+
+ mull %s2_limb
+
+ addl %ebp, %eax
+ movl (%res_ptr,%size,4), %ebp
+
+ adcl $0, %edx
+ subl %eax, %ebp
+
+ movl %ebp, (%res_ptr,%size,4)
+ incl %size
+
+ movl %edx, %ebp
+ jnz L(oop)
+
+ adcl $0, %ebp
+ movl %ebp, %eax
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_submul_1)
diff --git a/REORG.TODO/sysdeps/i386/i686/Makefile b/REORG.TODO/sysdeps/i386/i686/Makefile
new file mode 100644
index 0000000000..311042787b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/Makefile
@@ -0,0 +1,12 @@
+# So that we can test __m128's alignment
+stack-align-test-flags += -msse
+
+CFLAGS-.o += -Wa,-mtune=i686
+CFLAGS-.os += -Wa,-mtune=i686
+CFLAGS-.op += -Wa,-mtune=i686
+CFLAGS-.oS += -Wa,-mtune=i686
+
+ASFLAGS-.o += -Wa,-mtune=i686
+ASFLAGS-.os += -Wa,-mtune=i686
+ASFLAGS-.op += -Wa,-mtune=i686
+ASFLAGS-.oS += -Wa,-mtune=i686
diff --git a/REORG.TODO/sysdeps/i386/i686/add_n.S b/REORG.TODO/sysdeps/i386/i686/add_n.S
new file mode 100644
index 0000000000..4afa648ceb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/add_n.S
@@ -0,0 +1,110 @@
+/* Add two limb vectors of the same length > 0 and store sum in a third
+ limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+#ifdef PIC
+L(1): addl (%esp), %eax
+ ret
+#endif
+ENTRY (__mpn_add_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 4)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 0)
+ movl S2(%esp),%edx
+ movl SIZE(%esp),%ecx
+ movl %ecx,%eax
+ shrl $3,%ecx /* compute count for unrolled loop */
+ negl %eax
+ andl $7,%eax /* get index where to start loop */
+ jz L(oop) /* necessary special case for 0 */
+ incl %ecx /* adjust loop count */
+ shll $2,%eax /* adjustment for pointers... */
+ subl %eax,%edi /* ... since they are offset ... */
+ subl %eax,%esi /* ... by a constant when we ... */
+ subl %eax,%edx /* ... enter the loop */
+ shrl $2,%eax /* restore previous value */
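+/* Each unrolled step in L(oop) below assembles to 9 bytes, except the
+   first, which uses no displacement and is only 6 bytes, so the entry
+   point for skipping %eax limbs is L(oop) + 9*%eax - 3; the scaled
+   leal computes exactly that.  */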
+#ifdef PIC
+/* Calculate start address in loop for PIC. */
+ leal (L(oop)-L(0)-3)(%eax,%eax,8),%eax
+ call L(1)
+L(0):
+#else
+/* Calculate start address in loop for non-PIC. */
+ leal (L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+ jmp *%eax /* jump into loop */
+ ALIGN (3)
+L(oop): movl (%esi),%eax
+ adcl (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ adcl 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ adcl 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ adcl 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ adcl 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ adcl 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ adcl 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ adcl 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_add_n)
diff --git a/REORG.TODO/sysdeps/i386/i686/bcopy.S b/REORG.TODO/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 0000000000..15ef9419a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/bzero.S b/REORG.TODO/sysdeps/i386/i686/bzero.S
new file mode 100644
index 0000000000..c7898f18e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/bzero.S
@@ -0,0 +1,4 @@
+#define USE_AS_BZERO
+#define memset __bzero
+#include <sysdeps/i386/i686/memset.S>
+weak_alias (__bzero, bzero)
diff --git a/REORG.TODO/sysdeps/i386/i686/dl-hash.h b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
new file mode 100644
index 0000000000..ceda785b32
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/dl-hash.h
@@ -0,0 +1,79 @@
+/* Compute hash value for given string according to ELF standard.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _DL_HASH_H
+#define _DL_HASH_H 1
+
+
+/* This is the hashing function specified by the ELF ABI.  It is highly
+   optimized for PII processors.  Though it will run on an i586, it
+   would be much slower there than the generic C implementation, so
+   don't use it on such machines.  */
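+
+/* For reference, the C formulation of the hash from the System V ABI,
+   which the assembly below implements:
+
+     unsigned long
+     elf_hash (const unsigned char *name)
+     {
+       unsigned long h = 0, g;
+       while (*name)
+         {
+           h = (h << 4) + *name++;
+           g = h & 0xf0000000;
+           if (g != 0)
+             h ^= g >> 24;
+           h &= ~g;
+         }
+       return h;
+     }
+ */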
+static unsigned int
+__attribute__ ((unused))
+_dl_elf_hash (const char *name)
+{
+ unsigned int result;
+ unsigned int temp0;
+ unsigned int temp1;
+
+ __asm__ __volatile__
+ ("movzbl (%1),%2\n\t"
+ "testl %2, %2\n\t"
+ "jz 1f\n\t"
+ "movl %2, %0\n\t"
+ "movzbl 1(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 2(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 3(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl %2, %0\n\t"
+ "movzbl 4(%1), %2\n\t"
+ "jecxz 1f\n\t"
+ "shll $4, %0\n\t"
+ "addl $5, %1\n\t"
+ "addl %2, %0\n\t"
+ "movzbl (%1), %2\n\t"
+ "jecxz 1f\n"
+ "2:\t"
+ "shll $4, %0\n\t"
+ "movl $0xf0000000, %3\n\t"
+ "incl %1\n\t"
+ "addl %2, %0\n\t"
+ "andl %0, %3\n\t"
+ "andl $0x0fffffff, %0\n\t"
+ "shrl $24, %3\n\t"
+ "movzbl (%1), %2\n\t"
+ "xorl %3, %0\n\t"
+ "testl %2, %2\n\t"
+ "jnz 2b\n"
+ "1:\t"
+ : "=&r" (result), "=r" (name), "=&c" (temp0), "=&r" (temp1)
+ : "0" (0), "1" ((const unsigned char *) name));
+
+ return result;
+}
+
+#endif /* dl-hash.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/ffs.c b/REORG.TODO/sysdeps/i386/i686/ffs.c
new file mode 100644
index 0000000000..cbe36ff873
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/ffs.c
@@ -0,0 +1,48 @@
+/* ffs -- find first set bit in a word, counted from least significant end.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define ffsl __something_else
+#include <string.h>
+
+#undef ffs
+
+#ifdef __GNUC__
+
+int
+__ffs (int x)
+{
+ int cnt;
+ int tmp;
+
+ asm ("bsfl %2,%0\n" /* Count low bits in X and store in %1. */
+ "cmovel %1,%0\n" /* If number was zero, use -1 as result. */
+ : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1));
+
+ return cnt + 1;
+}
+weak_alias (__ffs, ffs)
+libc_hidden_def (__ffs)
+libc_hidden_builtin_def (ffs)
+#undef ffsl
+weak_alias (__ffs, ffsl)
+
+#else
+#include <string/ffs.c>
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 0000000000..73060b088c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_log)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__log_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 0000000000..6fd39d50d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_logf)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__logf_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 0000000000..7e3bc8d817
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,94 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+ .section .rodata.cst8,"aM",@progbits,8
+
+ .p2align 3
+ .type one,@object
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ .type limit,@object
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_logl)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ movzwl 4+8(%esp), %eax
+ cmpl $0xc000, %eax
+ jae 5f // x <= -2, avoid overflow from -LDBL_MAX - 1.
+ fsubl MO(one) // x-1 : x : log(2)
+5: fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2f
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 4f
+ fabs // log(1) is +0 in all rounding modes.
+4: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ fadd %st(0)
+ ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2b
+ fxam
+ fnstsw
+ andb $0x45, %ah
+ cmpb $0x40, %ah
+ jne 6f
+ fabs // log(1) is +0 in all rounding modes.
+6: fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logl_finite)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
new file mode 100644
index 0000000000..7d9089232f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),math)
+libm-sysdep_routines += e_expf-sse2 e_expf-ia32 s_sinf-sse2 s_cosf-sse2 \
+ s_sincosf-sse2
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
new file mode 100644
index 0000000000..b486b4d1ca
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S
@@ -0,0 +1,22 @@
+/*
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define __ieee754_expf __ieee754_expf_ia32
+#define __expf_finite __expf_finite_ia32
+
+#include <sysdeps/i386/fpu/e_expf.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
new file mode 100644
index 0000000000..e6bb6fa289
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S
@@ -0,0 +1,325 @@
+/* SSE2 version of __ieee754_expf and __expf_finite
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size).
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ * where
+ * x = m*log(2)/K + y, y in [0.0..log(2)/K]
+ * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
+ * values of 2^(j/K) are tabulated as T[j].
+ *
+ * P(y) is a minimax polynomial approximation of expf(x)-1
+ * on small interval [0.0..log(2)/K].
+ *
+ * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ * __ieee754_expf_sse2(NaN) = NaN
+ * __ieee754_expf_sse2(+INF) = +INF
+ * __ieee754_expf_sse2(-INF) = 0
+ * __ieee754_expf_sse2(x) = 1 for subnormals
+ * for finite argument, only __ieee754_expf_sse2(0)=1 is exact
+ * __ieee754_expf_sse2(x) overflows if x>700
+ * __ieee754_expf_sse2(x) underflows if x<-700
+ *
+ * Note:
+ * For |x|<700, __ieee754_expf_sse2 computes result in double precision,
+ * with accuracy a bit more than needed for expf, and does not round it
+ * to single precision.
+ */
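+
+/* For orientation only, a rough C model of the same scheme (hypothetical,
+ * not part of the build; math.h helpers stand in for the tuned constants
+ * and the precomputed table T[]):
+ *
+ *   #include <math.h>
+ *
+ *   static double expf_model (float x)
+ *   {
+ *     const int K = 64;
+ *     double m = round ((double) x * K / M_LN2);  // m = n*K + j
+ *     int n = (int) m >> 6;                       // m / K
+ *     int j = (int) m & 63;                       // m % K
+ *     double y = (double) x - m * (M_LN2 / K);    // reduced argument
+ *     double Tj = exp2 ((double) j / K);          // table value 2^(j/K)
+ *     double P = y + y * y / 2 + y * y * y / 6;   // stand-in for P(y)
+ *     return ldexp (Tj * (1.0 + P), n);           // 2^n * T[j] * (1+P(y))
+ *   }
+ *
+ * Like the assembly, the model returns the double-precision value without
+ * rounding it to single precision; the real code rounds with the
+ * 2^23+2^22 shifter (DP_RS/SP_RS) instead of round(), and uses a minimax
+ * polynomial for P(y).
+ */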
+
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%edx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%edx,reg2,_scale)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+#endif
+
+ .text
+ENTRY(__ieee754_expf_sse2)
+ /* Input: single precision x on stack at address 4(%esp) */
+
+#ifdef PIC
+ LOAD_PIC_REG(dx)
+#endif
+
+ cvtss2sd 4(%esp), %xmm1 /* Convert x to double precision */
+ mov 4(%esp), %ecx /* Copy x */
+ movsd MO1(DP_KLN2), %xmm2 /* DP K/log(2) */
+ movsd MO1(DP_P2), %xmm3 /* DP P2 */
+ movl %ecx, %eax /* x */
+ mulsd %xmm1, %xmm2 /* DP x*K/log(2) */
+ andl $0x7fffffff, %ecx /* |x| */
+ cmpl $0x442f0000, %ecx /* |x|<700 ? */
+ movsd MO1(DP_P3), %xmm4 /* DP P3 */
+ addsd MO1(DP_RS), %xmm2 /* DP x*K/log(2)+RS */
+ jae L(special_paths)
+
+ /* Here if |x|<700 */
+ cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */
+ jb L(small_arg)
+
+ /* Main path: here if 2^(-28)<=|x|<700 */
+ cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */
+ movd %xmm2, %eax /* bits of n*K+j with trash */
+ subss MO1(SP_RS), %xmm2 /* SP t=round(x*K/log(2)) */
+ movl %eax, %ecx /* n*K+j with trash */
+ cvtss2sd %xmm2, %xmm2 /* DP t */
+ andl $0x3f, %eax /* bits of j */
+ mulsd MO1(DP_NLN2K), %xmm2 /* DP -t*log(2)/K */
+ andl $0xffffffc0, %ecx /* bits of n */
+#ifdef __AVX__
+ vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */
+ vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */
+#else
+ addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */
+ movaps %xmm2, %xmm0 /* DP y */
+ mulsd %xmm2, %xmm2 /* DP z=y*y */
+#endif
+ mulsd %xmm2, %xmm4 /* DP P3*z */
+ addl $0xffc0, %ecx /* bits of n + DP exponent bias (1023 << 6) */
+ mulsd %xmm2, %xmm3 /* DP P2*z */
+ shrl $2, %ecx /* High 2 bytes of DP 2^n */
+ pxor %xmm1, %xmm1 /* clear %xmm1 */
+ addsd MO1(DP_P1), %xmm4 /* DP P3*z+P1 */
+ addsd MO1(DP_P0), %xmm3 /* DP P2*z+P0 */
+ pinsrw $3, %ecx, %xmm1 /* DP 2^n */
+ mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */
+ mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */
+ addsd %xmm4, %xmm0 /* DP P(y) */
+ mulsd MO2(DP_T,%eax,8), %xmm0 /* DP P(y)*T[j] */
+ addsd MO2(DP_T,%eax,8), %xmm0 /* DP T[j]*(P(y)+1) */
+ mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */
+ cvtsd2ss %xmm0, %xmm1
+
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm1, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ lea 4(%esp), %esp /* Return back 4 bytes of stack frame */
+ ret
+
+ .p2align 4
+L(small_arg):
+ /* Here if 0<=|x|<2^(-28) */
+ movss 4(%esp), %xmm0 /* load x */
+ addss MO1(SP_ONE), %xmm0 /* 1.0 + x */
+ /* Return 1.0 with inexact raised, except for x==0 */
+ jmp L(epilogue)
+
+ .p2align 4
+L(special_paths):
+ /* Here if x is NaN, or Inf, or finite |x|>=700 */
+ movss 4(%esp), %xmm0 /* load x */
+
+ cmpl $0x7f800000, %ecx /* |x| is finite ? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=700 */
+ testl $0x80000000, %eax /* sign of x nonzero ? */
+ je L(res_overflow)
+
+ /* Here if finite x<=-700 */
+ movss MO1(SP_SMALL), %xmm0 /* load small value 2^(-100) */
+ mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */
+ jmp L(epilogue)
+
+ .p2align 4
+L(res_overflow):
+ /* Here if finite x>=700 */
+ movss MO1(SP_LARGE), %xmm0 /* load large value 2^100 */
+ mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+ jne L(arg_nan) /* |x| is Inf ? */
+
+ /* Here if |x| is Inf */
+ shrl $31, %eax /* Get sign bit of x */
+ movss MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_nan):
+ /* Here if |x| is NaN */
+ addss %xmm0, %xmm0 /* Return x+x (raise invalid) */
+
+ .p2align 4
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm0, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ lea 4(%esp), %esp /* Return back 4 bytes of stack frame */
+ ret
+END(__ieee754_expf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
+ .long 0x00000000, 0x3ff00000
+ .long 0x3e778061, 0x3ff02c9a
+ .long 0xd3158574, 0x3ff059b0
+ .long 0x18759bc8, 0x3ff08745
+ .long 0x6cf9890f, 0x3ff0b558
+ .long 0x32d3d1a2, 0x3ff0e3ec
+ .long 0xd0125b51, 0x3ff11301
+ .long 0xaea92de0, 0x3ff1429a
+ .long 0x3c7d517b, 0x3ff172b8
+ .long 0xeb6fcb75, 0x3ff1a35b
+ .long 0x3168b9aa, 0x3ff1d487
+ .long 0x88628cd6, 0x3ff2063b
+ .long 0x6e756238, 0x3ff2387a
+ .long 0x65e27cdd, 0x3ff26b45
+ .long 0xf51fdee1, 0x3ff29e9d
+ .long 0xa6e4030b, 0x3ff2d285
+ .long 0x0a31b715, 0x3ff306fe
+ .long 0xb26416ff, 0x3ff33c08
+ .long 0x373aa9cb, 0x3ff371a7
+ .long 0x34e59ff7, 0x3ff3a7db
+ .long 0x4c123422, 0x3ff3dea6
+ .long 0x21f72e2a, 0x3ff4160a
+ .long 0x6061892d, 0x3ff44e08
+ .long 0xb5c13cd0, 0x3ff486a2
+ .long 0xd5362a27, 0x3ff4bfda
+ .long 0x769d2ca7, 0x3ff4f9b2
+ .long 0x569d4f82, 0x3ff5342b
+ .long 0x36b527da, 0x3ff56f47
+ .long 0xdd485429, 0x3ff5ab07
+ .long 0x15ad2148, 0x3ff5e76f
+ .long 0xb03a5585, 0x3ff6247e
+ .long 0x82552225, 0x3ff66238
+ .long 0x667f3bcd, 0x3ff6a09e
+ .long 0x3c651a2f, 0x3ff6dfb2
+ .long 0xe8ec5f74, 0x3ff71f75
+ .long 0x564267c9, 0x3ff75feb
+ .long 0x73eb0187, 0x3ff7a114
+ .long 0x36cf4e62, 0x3ff7e2f3
+ .long 0x994cce13, 0x3ff82589
+ .long 0x9b4492ed, 0x3ff868d9
+ .long 0x422aa0db, 0x3ff8ace5
+ .long 0x99157736, 0x3ff8f1ae
+ .long 0xb0cdc5e5, 0x3ff93737
+ .long 0x9fde4e50, 0x3ff97d82
+ .long 0x82a3f090, 0x3ff9c491
+ .long 0x7b5de565, 0x3ffa0c66
+ .long 0xb23e255d, 0x3ffa5503
+ .long 0x5579fdbf, 0x3ffa9e6b
+ .long 0x995ad3ad, 0x3ffae89f
+ .long 0xb84f15fb, 0x3ffb33a2
+ .long 0xf2fb5e47, 0x3ffb7f76
+ .long 0x904bc1d2, 0x3ffbcc1e
+ .long 0xdd85529c, 0x3ffc199b
+ .long 0x2e57d14b, 0x3ffc67f1
+ .long 0xdcef9069, 0x3ffcb720
+ .long 0x4a07897c, 0x3ffd072d
+ .long 0xdcfba487, 0x3ffd5818
+ .long 0x03db3285, 0x3ffda9e6
+ .long 0x337b9b5f, 0x3ffdfc97
+ .long 0xe78b3ff6, 0x3ffe502e
+ .long 0xa2a490da, 0x3ffea4af
+ .long 0xee615a27, 0x3ffefa1b
+ .long 0x5b6e4540, 0x3fff5076
+ .long 0x819e90d8, 0x3fffa7c1
+ .type L(DP_T), @object
+ ASM_SIZE_DIRECTIVE(L(DP_T))
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .p2align 3
+L(DP_KLN2): /* double precision K/log(2) */
+ .long 0x652b82fe, 0x40571547
+ .type L(DP_KLN2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_KLN2))
+
+ .p2align 3
+L(DP_NLN2K): /* double precision -log(2)/K */
+ .long 0xfefa39ef, 0xbf862e42
+ .type L(DP_NLN2K), @object
+ ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
+
+ .p2align 3
+L(DP_RS): /* double precision 2^23+2^22 */
+ .long 0x00000000, 0x41680000
+ .type L(DP_RS), @object
+ ASM_SIZE_DIRECTIVE(L(DP_RS))
+
+ .p2align 3
+L(DP_P3): /* double precision polynomial coefficient P3 */
+ .long 0xeb78fa85, 0x3fa56420
+ .type L(DP_P3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P3))
+
+ .p2align 3
+L(DP_P1): /* double precision polynomial coefficient P1 */
+ .long 0x008d6118, 0x3fe00000
+ .type L(DP_P1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P1))
+
+ .p2align 3
+L(DP_P2): /* double precision polynomial coefficient P2 */
+ .long 0xda752d4f, 0x3fc55550
+ .type L(DP_P2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P2))
+
+ .p2align 3
+L(DP_P0): /* double precision polynomial coefficient P0 */
+ .long 0xffffe7c6, 0x3fefffff
+ .type L(DP_P0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_P0))
+
+ .p2align 2
+L(SP_INF_0):
+ .long 0x7f800000 /* single precision Inf */
+ .long 0 /* single precision zero */
+ .type L(SP_INF_0), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INF_0))
+
+ .section .rodata.cst4,"aM",@progbits,4
+ .p2align 2
+L(SP_RS): /* single precision 2^23+2^22 */
+ .long 0x4b400000
+ .type L(SP_RS), @object
+ ASM_SIZE_DIRECTIVE(L(SP_RS))
+
+ .p2align 2
+L(SP_SMALL): /* single precision small value 2^(-100) */
+ .long 0x0d800000
+ .type L(SP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(SP_SMALL))
+
+ .p2align 2
+L(SP_LARGE): /* single precision large value 2^100 */
+ .long 0x71800000
+ .type L(SP_LARGE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_LARGE))
+
+ .p2align 2
+L(SP_ONE): /* single precision 1.0 */
+ .long 0x3f800000
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+strong_alias (__ieee754_expf_sse2, __expf_finite_sse2)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
new file mode 100644
index 0000000000..388cf98a39
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c
@@ -0,0 +1,37 @@
+/* Multiple versions of expf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern double __ieee754_expf_sse2 (double);
+extern double __ieee754_expf_ia32 (double);
+
+double __ieee754_expf (double);
+libm_ifunc (__ieee754_expf,
+ HAS_CPU_FEATURE (SSE2)
+ ? __ieee754_expf_sse2
+ : __ieee754_expf_ia32);
+
+extern double __expf_finite_sse2 (double);
+extern double __expf_finite_ia32 (double);
+
+double __expf_finite (double);
+libm_ifunc (__expf_finite,
+ HAS_CPU_FEATURE (SSE2)
+ ? __expf_finite_sse2
+ : __expf_finite_ia32);
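+
+/* Illustration only: libm_ifunc binds the symbol through a GNU indirect
+   function, roughly equivalent to this hypothetical GCC-attribute form:
+
+     static void *expf_resolver (void)
+     {
+       return HAS_CPU_FEATURE (SSE2) ? (void *) __ieee754_expf_sse2
+                                     : (void *) __ieee754_expf_ia32;
+     }
+
+     double __ieee754_expf (double)
+       __attribute__ ((ifunc ("expf_resolver")));
+
+   The resolver runs once, when the dynamic linker processes the symbol's
+   relocation, so ordinary calls carry no per-call dispatch cost.  */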
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
new file mode 100644
index 0000000000..04bc23b37b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -0,0 +1,2188 @@
+# Begin of automatic generation
+
+# Maximal error of functions:
+Function: "acos":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "acos_downward":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_towardzero":
+ildouble: 2
+ldouble: 2
+
+Function: "acos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "acosh":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 2
+
+Function: "acosh_downward":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_towardzero":
+double: 1
+idouble: 1
+ildouble: 6
+ldouble: 4
+
+Function: "acosh_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 3
+
+Function: "asin":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "asin_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asin_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "asinh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "asinh_downward":
+double: 1
+float: 1
+idouble: 1
+ildouble: 5
+ldouble: 5
+
+Function: "asinh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "asinh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "atan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "atanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "atanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "atanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 3
+
+Function: "atanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "cabs":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cabs_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cacos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacos_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "cacos_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "cacos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacos_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cacosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cacosh_downward":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cacosh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cacosh_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "cacosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "carg":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "carg_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "casin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "casin_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Real part of "casin_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "casin_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Real part of "casin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casin_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Real part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "casinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "casinh_downward":
+double: 5
+float: 3
+idouble: 5
+ifloat: 3
+ildouble: 6
+ldouble: 6
+
+Function: Imaginary part of "casinh_downward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_towardzero":
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "casinh_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "casinh_upward":
+double: 7
+float: 7
+idouble: 7
+ifloat: 7
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "casinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "catan":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catan_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "catan_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catan_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "catanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "catanh":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "catanh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "catanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cbrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "cbrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cbrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccos":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccos_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccos_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccos_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ccosh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ccosh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ccosh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ccosh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ccosh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cexp":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "cexp":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cexp_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cexp_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "cexp_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cexp_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "clog":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog10":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog10":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "clog10_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 8
+ldouble: 8
+
+Function: Imaginary part of "clog10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog10_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "clog10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "clog_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "clog_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "clog_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "clog_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "cos":
+ildouble: 1
+ldouble: 1
+
+Function: "cos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "cos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cos_upward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh":
+double: 1
+float: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: "cosh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "cosh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 3
+
+Function: Real part of "cpow":
+double: 2
+float: 5
+idouble: 2
+ifloat: 5
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "cpow":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "cpow_downward":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "cpow_towardzero":
+double: 5
+float: 8
+idouble: 5
+ifloat: 8
+ildouble: 7
+ldouble: 7
+
+Function: Imaginary part of "cpow_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "cpow_upward":
+double: 4
+float: 1
+idouble: 4
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "cpow_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csin":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+
+Function: Real part of "csin_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csin_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csin_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "csinh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "csinh_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_downward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csinh_upward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "csinh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "csqrt":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "csqrt":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "csqrt_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "csqrt_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "csqrt_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Imaginary part of "ctan":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Real part of "ctan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_towardzero":
+double: 3
+float: 1
+idouble: 3
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: Imaginary part of "ctan_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctan_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctan_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: Imaginary part of "ctanh":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: Real part of "ctanh_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_downward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Real part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: Imaginary part of "ctanh_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Real part of "ctanh_upward":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: Imaginary part of "ctanh_upward":
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "erf":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erf_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "erfc":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "erfc_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "erfc_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "exp":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_downward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp2_upward":
+ildouble: 1
+ldouble: 1
+
+Function: "exp_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "exp_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "exp_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "expm1":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "expm1_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "expm1_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "gamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "gamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "hypot":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_towardzero":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "hypot_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "j0_downward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j0_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 5
+ldouble: 5
+
+Function: "j0_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "j1":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "j1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 4
+ldouble: 4
+
+Function: "j1_towardzero":
+double: 2
+float: 1
+idouble: 2
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "j1_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 3
+ldouble: 3
+
+Function: "jn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_downward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "jn_towardzero":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "jn_upward":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "lgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 7
+ldouble: 7
+
+Function: "lgamma_upward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "log":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log1p":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log1p_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 4
+
+Function: "log1p_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "log2":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_downward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log2_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "log_downward":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "log_upward":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+
+Function: "pow10_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow10_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "pow_downward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_towardzero":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "pow_upward":
+double: 1
+idouble: 1
+ildouble: 4
+ldouble: 4
+
+Function: "sin":
+ildouble: 1
+ldouble: 1
+
+Function: "sin_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sin_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sin_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos":
+ildouble: 1
+ldouble: 1
+
+Function: "sincos_downward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sincos_towardzero":
+double: 1
+idouble: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sincos_upward":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "sinh":
+double: 1
+ildouble: 2
+ldouble: 2
+
+Function: "sinh_downward":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "sinh_towardzero":
+double: 2
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 4
+
+Function: "sinh_upward":
+double: 4
+float: 2
+idouble: 1
+ifloat: 1
+ildouble: 4
+ldouble: 5
+
+Function: "tan":
+float: 1
+ifloat: 1
+ildouble: 2
+ldouble: 2
+
+Function: "tan_downward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_towardzero":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "tan_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "tanh":
+double: 1
+idouble: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_downward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 7
+ldouble: 4
+
+Function: "tanh_towardzero":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 3
+ldouble: 3
+
+Function: "tanh_upward":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 5
+ldouble: 4
+
+Function: "tgamma":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_downward":
+double: 3
+float: 4
+idouble: 3
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_towardzero":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "tgamma_upward":
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
+
+Function: "y0":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+
+Function: "y0_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y0_upward":
+double: 1
+float: 2
+idouble: 1
+ifloat: 2
+ildouble: 3
+ldouble: 3
+
+Function: "y1":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 2
+ldouble: 2
+
+Function: "y1_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 7
+ldouble: 7
+
+Function: "y1_towardzero":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "y1_upward":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
+ildouble: 7
+ldouble: 7
+
+Function: "yn":
+double: 2
+float: 3
+idouble: 2
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+Function: "yn_downward":
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
+ildouble: 5
+ldouble: 5
+
+Function: "yn_towardzero":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 5
+ldouble: 5
+
+Function: "yn_upward":
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
+
+# end of automatic generation
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
new file mode 100644
index 0000000000..193dd704b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name
@@ -0,0 +1 @@
+i686
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
new file mode 100644
index 0000000000..f37850d0b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S
@@ -0,0 +1,553 @@
+/* Optimized SSE2 version of cosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x| == 0: return 1.0-|x|.
+ * 2) if |x| < 2^-27: return 1.0-|x|.
+ * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
+ * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * s = (-1.0)^((n>>2)&1)
+ * if(n&2 != 0) {
+ * using cos(t) polynomial for |t|<Pi/4, result is
+ * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ * } else {
+ * using sin(t) polynomial for |t|<Pi/4, result is
+ * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ * }
+ * 6) if |x| < 2^23, large args:
+ * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
+ * t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * cos(+-0) = 1 not raising inexact,
+ * cos(subnormal) raises inexact,
+ * cos(min_normalized) raises inexact,
+ * cos(normalized) raises inexact,
+ * cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * cos(NaN) = NaN.
+ */
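+
+/* A hypothetical C outline of steps (5.1)-(5.2) above (sketch only; the
+ * libm cos()/sin() calls stand in for the |t|<Pi/4 polynomials):
+ *
+ *   #include <math.h>
+ *
+ *   static float cosf_outline (float x)
+ *   {
+ *     double ax = fabs ((double) x);
+ *     int k = (int) (ax / M_PI_4);       // k = trunc(|x|/(Pi/4))
+ *     int j = (k + 1) & 0x0e;            // even multiple of Pi/4 near |x|
+ *     int n = k + 3;
+ *     double t = ax - j * M_PI_4;        // reduced argument
+ *     double s = ((n >> 2) & 1) ? -1.0 : 1.0;
+ *     return (float) (s * ((n & 2) ? cos (t) : sin (t)));
+ *   }
+ */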
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+#endif
+
+ .text
+ENTRY(__cosf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ addl $2, %eax /* n */
+ subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+ /* Input: %eax=n, %xmm0=t */
+ testl $2, %eax /* n&2 != 0? */
+ jz L(sin_poly)
+
+/*L(cos_poly):*/
+ /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+ */
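+ /* In C terms (illustrative only), with y = t*t and z = y*y the code
+ * below evaluates two independent Horner chains in parallel:
+ *   even = (C4*z + C2)*z + C0;   // accumulated in %xmm4
+ *   odd  = (C3*z + C1)*z;        // accumulated in %xmm3
+ *   result = s * (1.0 + y*even + odd);
+ * Splitting the polynomial this way shortens the dependency chain. */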
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_C4), %xmm4 /* C4 */
+ mulsd %xmm0, %xmm4 /* z*C4 */
+ movsd MO1(DP_C3), %xmm3 /* C3 */
+ mulsd %xmm0, %xmm3 /* z*C3 */
+ addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */
+ mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */
+ lea -8(%esp), %esp /* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */
+ mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */
+
+ addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+
+ mulsd MO2(DP_ONES,%eax,8), %xmm3 /* DP result */
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+ /* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(sin_poly):
+ /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+ */
+
+ movaps %xmm0, %xmm4 /* t */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_S4), %xmm2 /* S4 */
+ mulsd %xmm0, %xmm2 /* z*S4 */
+ movsd MO1(DP_S3), %xmm3 /* S3 */
+ mulsd %xmm0, %xmm3 /* z*S3 */
+ lea -8(%esp), %esp /* Borrow 8 bytes of stack frame */
+ addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */
+ mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */
+ mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+ /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+ mulsd MO2(DP_ONES,%eax,8), %xmm4
+ addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulsd %xmm4, %xmm3
+ /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+ /* Return back 8 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ addl $2, %eax /* n */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+ subl $68, %eax
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+ /* clear unneeded remainder from %ah */
+ andl $0xff, %eax
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $3, %eax /* n=k+3 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+ mulsd MO1(DP_PIO4), %xmm0 /* t *= PI04 */
+
+ jmp L(reconstruction) /* end of very_large_args path */
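+
+ /* The DP_2POW52 add/subtract sequence above is the usual shifter
+ * trick for rounding to the nearest integer in round-to-nearest
+ * mode; in illustrative C, assuming |v| < 2^51:
+ *   double r = (v + 0x1p52) - 0x1p52;   // r = v rounded to integer
+ * Adding 2^52 pushes the fraction bits out of the mantissa, and the
+ * comisd/subl pair above corrects the rounded-up case.  */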
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd MO1(DP_C4), %xmm3 /* C4 */
+ mulsd %xmm0, %xmm3 /* z*C4 */
+ movsd MO1(DP_C3), %xmm5 /* C3 */
+ mulsd %xmm0, %xmm5 /* z*C3 */
+ addsd MO1(DP_C2), %xmm3 /* C2+z*C4 */
+ mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */
+ addsd MO1(DP_C1), %xmm5 /* C1+z*C3 */
+ mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm3 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */
+ addsd %xmm5, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm3, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 4(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd MO1(DP_COS2_1), %xmm3 /* DP DP_COS2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */
+ addsd MO1(DP_COS2_0), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
+ /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
+ addsd MO1(DP_ONES), %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_less_2pn27):
+ /* Here if |x|<2^-27 */
+ movss ARG_X, %xmm0 /* x */
+ andps MO1(SP_ABS_MASK),%xmm0 /* |x| */
+ movss MO1(SP_ONE), %xmm3 /* 1.0 */
+ subss %xmm0, %xmm3 /* result is 1.0-|x| */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+ jne L(skip_errno_setting) /* in case of x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ movss ARG_X, %xmm3 /* load x */
+ subss %xmm3, %xmm3 /* Result is NaN */
+ jmp L(epilogue)
+END(__cosf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_COS2_0):
+ .long 0xff5cc6fd,0xbfdfffff
+ .type L(DP_COS2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
+
+ .p2align 3
+L(DP_COS2_1):
+ .long 0xb178dac5,0x3fa55514
+ .type L(DP_COS2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ZERONE),@object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of -Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11A62633,0xbe010b46 /* Low part of -Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+ .p2align 2
+L(SP_ONE):
+ .long 0x3f800000 /* 1.0 */
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__cosf, cosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
new file mode 100644
index 0000000000..af588de9dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c
@@ -0,0 +1,29 @@
+/* Multiple versions of cosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern float __cosf_sse2 (float);
+extern float __cosf_ia32 (float);
+float __cosf (float);
+
+libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? __cosf_sse2 : __cosf_ia32);
+weak_alias (__cosf, cosf);
+
+#define COSF __cosf_ia32
+#include <sysdeps/ieee754/flt-32/s_cosf.c>
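The libm_ifunc line installs an IFUNC resolver: the dynamic linker calls it once at relocation time and binds cosf to whichever implementation the resolver returns, so the CPU-feature test is paid once at load time instead of on every call. A minimal stand-alone sketch of the same mechanism, using GCC's ifunc attribute rather than glibc's internal libm_ifunc macro; my_cosf_* and resolve_cosf are invented names for illustration:

    /* Stand-alone IFUNC sketch (not glibc's libm_ifunc).  Build with
       GCC on x86.  */
    #include <stdio.h>

    static float my_cosf_sse2 (float x) { return x; }  /* stand-in body */
    static float my_cosf_ia32 (float x) { return -x; } /* stand-in body */

    /* The resolver runs before constructors, so initialize the CPU
       feature data explicitly before querying it.  */
    static float (*resolve_cosf (void)) (float)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("sse2") ? my_cosf_sse2 : my_cosf_ia32;
    }

    float my_cosf (float) __attribute__ ((ifunc ("resolve_cosf")));

    int main (void)
    {
      printf ("%f\n", (double) my_cosf (1.0f));
      return 0;
    }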
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
new file mode 100644
index 0000000000..f31a925522
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S
@@ -0,0 +1,586 @@
+/* Optimized with sse2 version of sincosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x|==0: sin(x)=x,
+ * cos(x)=1.
+ * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed,
+ * cos(x)=1-|x|.
+ * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1,
+ * cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1
+ * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * sign_sin = sign(x) * (-1.0)^((n>>2)&1)
+ * sign_cos = (-1.0)^(((n+2)>>2)&1)
+ * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t
+ * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*1.0+1.0
+ * if(n&2 != 0) {
+ * using cos(t) and sin(t) polynomials for |t|<Pi/4, results are
+ * cos(x) = poly_sin * sign_cos
+ * sin(x) = poly_cos * sign_sin
+ * } else {
+ * sin(x) = poly_sin * sign_sin
+ * cos(x) = poly_cos * sign_cos
+ * }
+ * 6) if 9*Pi/4 <= |x| < 2^23, large args:
+ * 6.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction:
+ * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * sin/cos(+-0) = +-0/1 not raising inexact/underflow,
+ * sin/cos(subnormal) raises inexact/underflow,
+ * sin/cos(min_normalized) raises inexact/underflow,
+ * sin/cos(normalized) raises inexact,
+ * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * sin/cos(NaN) = NaN.
+ */
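Step 5.2 above is the heart of the routine: one SIMD polynomial evaluation yields both results, and the octant bits of n decide the signs and whether the sin/cos roles swap. A hedged C sketch of just that bookkeeping, with libm's sin and cos standing in for the S- and C-polynomials; reconstruct and its parameter names are invented:

    #include <math.h>

    /* Hedged sketch of reconstruction (5.2).  sin(t)/cos(t) stand in
       for the polynomials; sign_x is 1 when x was negative.  */
    static void reconstruct (double t, int n, int sign_x,
                             float *sinp, float *cosp)
    {
      double poly_sin = sin (t);
      double poly_cos = cos (t);
      double sign_sin = (sign_x ^ ((n >> 2) & 1)) ? -1.0 : 1.0;
      double sign_cos = (((n + 2) >> 2) & 1) ? -1.0 : 1.0;
      if (n & 2)
        {
          /* Octants where the polynomial roles swap.  */
          *sinp = (float) (poly_cos * sign_sin);
          *cosp = (float) (poly_sin * sign_cos);
        }
      else
        {
          *sinp = (float) (poly_sin * sign_sin);
          *cosp = (float) (poly_cos * sign_cos);
        }
    }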
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+# define ARG_SIN_PTR 12(%esp)
+# define ARG_COS_PTR 16(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+# define ARG_SIN_PTR 8(%esp)
+# define ARG_COS_PTR 12(%esp)
+#endif
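Under PIC, MO1(DP_PIO4) expands to L(DP_PIO4)@GOTOFF(%ebx), an offset from the GOT base that ENTRANCE loads into %ebx via LOAD_PIC_REG, so the constant tables are addressed position-independently; MO2 adds an index register and scale, e.g. MO2(DP_ONES,%ecx,8) becomes L(DP_ONES)@GOTOFF(%ebx,%ecx,8). In the non-PIC build both collapse to absolute addresses and ENTRANCE/RETURN do nothing extra, which is why ARG_X and the pointer arguments shift by 4 bytes between the two configurations: the PIC path has pushed %ebx first.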
+
+ .text
+ENTRY(__sincosf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+ /* pointer to sin result on stack at address ARG_SIN_PTR */
+ /* pointer to cos result on stack at address ARG_COS_PTR */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4 ? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4 ? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ movl ARG_X, %ecx /* Load x */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ subsd MO2(PIO4J,%edx,8), %xmm0/* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+ /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+
+ movaps %xmm0, %xmm4 /* t */
+ movhpd MO1(DP_ONES), %xmm4 /* 1|t */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ movl $2, %edx
+ unpcklpd %xmm0, %xmm0 /* y|y */
+ addl %eax, %edx /* k+2 */
+ movaps %xmm0, %xmm1 /* y|y */
+ mulpd %xmm0, %xmm0 /* z=t^4|z=t^4 */
+
+ movaps MO1(DP_SC4), %xmm2 /* S4 */
+ mulpd %xmm0, %xmm2 /* z*S4 */
+ movaps MO1(DP_SC3), %xmm3 /* S3 */
+ mulpd %xmm0, %xmm3 /* z*S3 */
+ xorl %eax, %ecx /* (sign_x ^ (k>>2))<<2 */
+ addpd MO1(DP_SC2), %xmm2 /* S2+z*S4 */
+ mulpd %xmm0, %xmm2 /* z*(S2+z*S4) */
+ shrl $2, %edx /* (k+2)>>2 */
+ addpd MO1(DP_SC1), %xmm3 /* S1+z*S3 */
+ mulpd %xmm0, %xmm3 /* z*(S1+z*S3) */
+ shrl $2, %ecx /* sign_x ^ k>>2 */
+ addpd MO1(DP_SC0), %xmm2 /* S0+z*(S2+z*S4) */
+ andl $1, %edx /* sign_cos = ((k+2)>>2)&1 */
+ mulpd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+ andl $1, %ecx /* sign_sin = sign_x ^ ((k>>2)&1) */
+ addpd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulpd %xmm4, %xmm3 /*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+ testl $2, %eax /* n&2 != 0 ? */
+ addpd %xmm4, %xmm3 /*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+ jnz L(sin_result_sin_poly)
+
+/*L(sin_result_cos_poly):*/
+ /*
+ * Here if
+ * cos(x) = poly_sin * sign_cos
+ * sin(x) = poly_cos * sign_sin
+ */
+ movsd MO2(DP_ONES,%ecx,8), %xmm4/* 0|sign_sin */
+ movhpd MO2(DP_ONES,%edx,8), %xmm4/* sign_cos|sign_sin */
+ mulpd %xmm4, %xmm3 /* result_cos|result_sin */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(sin_result_sin_poly):
+ /*
+ * Here if
+ * sin(x) = poly_sin * sign_sin
+ * cos(x) = poly_cos * sign_cos
+ */
+ movsd MO2(DP_ONES,%edx,8), %xmm4/* 0|sign_cos */
+ movhpd MO2(DP_ONES,%ecx,8), %xmm4/* sign_sin|sign_cos */
+ mulpd %xmm4, %xmm3 /* result_sin|result_cos */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%ecx) /* store cos(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move sin(x) to xmm0[0] */
+ movss %xmm0, (%eax) /* store sin(x) */
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN ? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23 ? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movl ARG_X, %ecx /* Load x */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ subl $68, %eax /* bitpos=eb-0x7f+59, where 0x7f */
+ /* is exponent bias */
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+ andl $0xff, %eax /* clear unneeded remainder from %ah */
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3/* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5/* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0/* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1/* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19 ? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ movl ARG_X, %ecx /* Load x */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5 ? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3/* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ shrl $29, %ecx /* (sign of x) << 2 */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $1, %eax /* n=k+1 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+ mulsd MO1(DP_PIO4), %xmm0 /* t *= PI04 */
+
+ jmp L(reconstruction) /* end of very_large_args path */
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5 ? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* DP x */
+ movhpd MO1(DP_ONES), %xmm3 /* DP 1|x */
+ mulsd %xmm0, %xmm0 /* DP y=x^2 */
+ unpcklpd %xmm0, %xmm0 /* DP y|y */
+ movaps %xmm0, %xmm1 /* y|y */
+ mulpd %xmm0, %xmm0 /* z=x^4|z=x^4 */
+
+ movapd MO1(DP_SC4), %xmm4 /* S4 */
+ mulpd %xmm0, %xmm4 /* z*S4 */
+ movapd MO1(DP_SC3), %xmm5 /* S3 */
+ mulpd %xmm0, %xmm5 /* z*S3 */
+ addpd MO1(DP_SC2), %xmm4 /* S2+z*S4 */
+ mulpd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addpd MO1(DP_SC1), %xmm5 /* S1+z*S3 */
+ mulpd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addpd MO1(DP_SC0), %xmm4 /* S0+z*(S2+z*S4) */
+ mulpd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulpd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulpd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ addpd %xmm5, %xmm4 /*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/
+ movl ARG_SIN_PTR, %eax
+ addpd %xmm4, %xmm3 /*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/
+ movl ARG_COS_PTR, %ecx
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27 ? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ movhpd MO1(DP_ONES), %xmm1 /* DP 1|x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ unpcklpd %xmm0, %xmm0 /* DP x^2|x^2 */
+
+ movaps MO1(DP_SINCOS2_1), %xmm3/* DP DP_SIN2_1 */
+ mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addpd MO1(DP_SINCOS2_0), %xmm3/* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulpd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addpd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ movl ARG_SIN_PTR, %eax
+ cvtpd2ps %xmm3, %xmm0 /* SP results */
+ movl ARG_COS_PTR, %ecx
+ movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */
+ shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */
+ movss %xmm0, (%ecx) /* store cos(x) */
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn27):
+ movss ARG_X, %xmm7 /* SP x */
+ cmpl $0, %eax /* x=0 ? */
+ je L(arg_zero) /* in case x=0 return sin(+-0)==+-0 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ * cos(here)=1-|x| (raising inexact)
+ */
+ movaps %xmm0, %xmm3 /* DP x */
+ mulsd MO1(DP_SMALL), %xmm0 /* DP x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* DP sin result is x-x*DP_SMALL */
+ andps MO1(SP_ABS_MASK), %xmm7 /* SP |x| */
+ cvtsd2ss %xmm3, %xmm0 /* sin(x) */
+ movl ARG_SIN_PTR, %eax
+ movss MO1(SP_ONE), %xmm1 /* SP 1.0 */
+ movss %xmm0, (%eax) /* sin(x) store */
+ movl ARG_COS_PTR, %ecx
+ subss %xmm7, %xmm1 /* cos(x) */
+ movss %xmm1, (%ecx) /* cos(x) store */
+ RETURN
+
+ .p2align 4
+L(arg_zero):
+ movss MO1(SP_ONE), %xmm0 /* 1.0 */
+ movl ARG_SIN_PTR, %eax
+ movl ARG_COS_PTR, %ecx
+ movss %xmm7, (%eax) /* sin(+-0)==x */
+ movss %xmm0, (%ecx) /* cos(+-0)==1 */
+ RETURN
+
+ .p2align 4
+L(arg_inf_or_nan):
+ movss ARG_X, %xmm7 /* SP x */
+ /* Here if |x| is Inf or NAN */
+ jne L(skip_errno_setting) /* in case x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ subss %xmm7, %xmm7 /* x-x, result is NaN */
+ movl ARG_SIN_PTR, %eax
+ movl ARG_COS_PTR, %ecx
+ movss %xmm7, (%eax)
+ movss %xmm7, (%ecx)
+ RETURN
+END(__sincosf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
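Each nonzero _FPI entry holds the next 28 bits of 4/Pi as an exactly representable double (28-bit mantissa, zero tail), so the per-chunk products with |x| in the very_large_args path can be split without rounding; taken together the chunks are just 4/Pi. A quick hedged check in C, where the hex-float constants transcribe entries 1 through 4 of the table above:

    /* Hedged check: the leading _FPI chunks sum to 4/Pi within double
       precision (agreement up to the final ulp).  */
    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      const double fpi[] = {
        0x1.45f306cp+0,    /* .long 0x6c000000,0x3ff45f30 */
        0x1.c9c882ap-28,   /* .long 0x2a000000,0x3e3c9c88 */
        0x1.4fe13a8p-58,   /* .long 0xa8000000,0x3c54fe13 */
        0x1.f47d4dp-85,    /* .long 0xd0000000,0x3aaf47d4 */
      };
      double sum = fpi[0] + fpi[1] + fpi[2] + fpi[3];
      printf ("chunks: %.17g\n4/Pi:   %.17g\n", sum, 4.0 / M_PI);
      return 0;
    }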
+
+/* Coefficients of polynomials for */
+/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low DP part, */
+/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */
+/* for |x|<2^-5. */
+ .p2align 4
+L(DP_SINCOS2_0):
+ .long 0x5543d49d,0xbfc55555
+ .long 0xff5cc6fd,0xbfdfffff
+ .type L(DP_SINCOS2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0))
+
+ .p2align 4
+L(DP_SINCOS2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .long 0xb178dac5,0x3fa55514
+ .type L(DP_SINCOS2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomials for */
+/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low DP part, */
+/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */
+/* for |t|<Pi/4. */
+ .p2align 4
+L(DP_SC4):
+ .long 0x1674b58a,0xbe5a947e
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_SC4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC4))
+
+ .p2align 4
+L(DP_SC3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_SC3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC3))
+
+ .p2align 4
+L(DP_SC2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_SC2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC2))
+
+ .p2align 4
+L(DP_SC1):
+ .long 0x10c2688b,0x3f811111
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_SC1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC1))
+
+ .p2align 4
+L(DP_SC0):
+ .long 0x55551cd9,0xbfc55555
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_SC0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SC0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11a62633,0xbe010b46 /* Low part of Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+ .p2align 4
+L(SP_ABS_MASK): /* Mask for getting SP absolute value */
+ .long 0x7fffffff,0x7fffffff
+ .long 0x7fffffff,0x7fffffff
+ .type L(SP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
+
+ .p2align 2
+L(SP_ONE):
+ .long 0x3f800000 /* 1.0 */
+ .type L(SP_ONE), @object
+ ASM_SIZE_DIRECTIVE(L(SP_ONE))
+
+weak_alias (__sincosf, sincosf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..9428f9b4ea
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,30 @@
+/* Multiple versions of sincosf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern void __sincosf_sse2 (float, float *, float *);
+extern void __sincosf_ia32 (float, float *, float *);
+void __sincosf (float, float *, float *);
+
+libm_ifunc (__sincosf,
+ HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32);
+weak_alias (__sincosf, sincosf);
+
+#define SINCOSF __sincosf_ia32
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
new file mode 100644
index 0000000000..ee96018061
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S
@@ -0,0 +1,566 @@
+/* Optimized with sse2 version of sinf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define __need_Emath
+#include <bits/errno.h>
+
+/* Short algorithm description:
+ *
+ * 1) if |x| == 0: return x.
+ * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ * 5) if |x| < 9*Pi/4:
+ * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 5.2) Reconstruction:
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * if(n&2 != 0) {
+ * using cos(t) polynomial for |t|<Pi/4, result is
+ * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ * } else {
+ * using sin(t) polynomial for |t|<Pi/4, result is
+ * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ * }
+ * 6) if 9*Pi/4 <= |x| < 2^23, large args:
+ * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 6.2) Reconstruction same as (5.2).
+ * 7) if |x| >= 2^23, very large args:
+ * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ * t=|x|-j*Pi/4.
+ * 7.2) Reconstruction same as (5.2).
+ * 8) if x is Inf, return x-x, and set errno=EDOM.
+ * 9) if x is NaN, return x-x.
+ *
+ * Special cases:
+ * sin(+-0) = +-0 not raising inexact/underflow,
+ * sin(subnormal) raises inexact/underflow,
+ * sin(min_normalized) raises inexact/underflow,
+ * sin(normalized) raises inexact,
+ * sin(Inf) = NaN, raises invalid, sets errno to EDOM,
+ * sin(NaN) = NaN.
+ */
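Step 6.1 depends on Pi/4 being split into PIO4HI (the leading ~27 mantissa bits, zero tail) and PIO4LO (the remainder): j*PIO4HI stays exact for every j that can occur below 2^23, so the leading digits of |x| cancel without rounding and PIO4LO then restores the discarded low bits. A hedged C sketch of the same reduction; reduce_medium and its locals are invented names, and the constants transcribe DP_INVPIO4, DP_PIO4HI and DP_PIO4LO (stored negated in the tables below):

    /* Hedged sketch of step 6.1 for 9*Pi/4 <= |x| < 2^23.  */
    static double reduce_medium (double ax, int *np)
    {
      const double invpio4 = 0x1.45f306dc9c883p+0;   /* 4/Pi */
      const double pio4_hi = 0x1.921fb54p-1;         /* leading bits of Pi/4 */
      const double pio4_lo = 0x1.10b4611a62633p-31;  /* the rest of Pi/4 */
      int k = (int) (ax * invpio4);   /* trunc(|x|/(Pi/4)) */
      int j = (k + 1) & ~1;           /* j = (k+1)&0xfffffffe */
      *np = k + 1;                    /* n, consumed by reconstruction */
      double t = ax - j * pio4_hi;    /* high product cancels exactly */
      return t - j * pio4_lo;         /* fold in the low correction */
    }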
+
+#ifdef PIC
+# define MO1(symbol) L(symbol)##@GOTOFF(%ebx)
+# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale)
+# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
+# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG)
+# define PUSH(REG) pushl REG; CFI_PUSH(REG)
+# define POP(REG) popl REG; CFI_POP(REG)
+# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx)
+# define ARG_X 8(%esp)
+#else
+# define MO1(symbol) L(symbol)
+# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale)
+# define ENTRANCE
+# define RETURN ret
+# define ARG_X 4(%esp)
+#endif
+
+ .text
+ENTRY(__sinf_sse2)
+ /* Input: single precision x on stack at address ARG_X */
+
+ ENTRANCE
+ movl ARG_X, %eax /* Bits of x */
+ cvtss2sd ARG_X, %xmm0 /* DP x */
+ andl $0x7fffffff, %eax /* |x| */
+
+ cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */
+ jb L(arg_less_pio4)
+
+ /* Here if |x|>=Pi/4 */
+ movd %eax, %xmm3 /* SP |x| */
+ andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */
+ movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */
+
+ cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */
+ jae L(large_args)
+
+ /* Here if Pi/4<=|x|<9*Pi/4 */
+ mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */
+ movl ARG_X, %ecx /* Load x */
+ cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */
+ shrl $31, %ecx /* sign of x */
+ addl $1, %eax /* k+1 */
+ movl $0x0e, %edx
+ andl %eax, %edx /* j = (k+1)&0x0e */
+ subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
+
+L(reconstruction):
+ /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */
+ testl $2, %eax /* n&2 != 0? */
+ jz L(sin_poly)
+
+/*L(cos_poly):*/
+ /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
+ */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_C4), %xmm4 /* C4 */
+ mulsd %xmm0, %xmm4 /* z*C4 */
+ xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */
+ movsd MO1(DP_C3), %xmm3 /* C3 */
+ mulsd %xmm0, %xmm3 /* z*C3 */
+ addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */
+ mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */
+ lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */
+ addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */
+ mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */
+ addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */
+ mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */
+
+ addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
+ addsd MO1(DP_ONES), %xmm3
+
+ mulsd MO2(DP_ONES,%ecx,8), %xmm3 /* DP result */
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(sin_poly):
+ /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4:
+ * y = t*t; z = y*y;
+ * s = sign(x) * (-1.0)^((n>>2)&1)
+ * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
+ */
+
+ movaps %xmm0, %xmm4 /* t */
+ shrl $2, %eax /* n>>2 */
+ mulsd %xmm0, %xmm0 /* y=t^2 */
+ andl $1, %eax /* (n>>2)&1 */
+ movaps %xmm0, %xmm1 /* y */
+ xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */
+ mulsd %xmm0, %xmm0 /* z=t^4 */
+
+ movsd MO1(DP_S4), %xmm2 /* S4 */
+ mulsd %xmm0, %xmm2 /* z*S4 */
+ movsd MO1(DP_S3), %xmm3 /* S3 */
+ mulsd %xmm0, %xmm3 /* z*S3 */
+ lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */
+ addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */
+ mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */
+ mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */
+ /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */
+ mulsd MO2(DP_ONES,%ecx,8), %xmm4
+ addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ mulsd %xmm4, %xmm3
+ /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ movsd %xmm3, 0(%esp) /* Move result from sse... */
+ fldl 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 8(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(large_args):
+ /* Here if |x|>=9*Pi/4 */
+ cmpl $0x7f800000, %eax /* x is Inf or NaN? */
+ jae L(arg_inf_or_nan)
+
+ /* Here if finite |x|>=9*Pi/4 */
+ cmpl $0x4b000000, %eax /* |x|<2^23? */
+ jae L(very_large_args)
+
+ /* Here if 9*Pi/4<=|x|<2^23 */
+ movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */
+ mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */
+ cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */
+ addl $1, %eax /* k+1 */
+ movl %eax, %edx
+ andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */
+ cvtsi2sdl %edx, %xmm4 /* DP j */
+ movl ARG_X, %ecx /* Load x */
+ movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */
+ shrl $31, %ecx /* sign bit of x */
+ mulsd %xmm4, %xmm2 /* -j*PIO4HI */
+ movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */
+ addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */
+ mulsd %xmm3, %xmm4 /* j*PIO4LO */
+ addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */
+ jmp L(reconstruction)
+
+ .p2align 4
+L(very_large_args):
+ /* Here if finite |x|>=2^23 */
+
+ /* bitpos = (ix>>23) - BIAS_32 + 59; */
+ shrl $23, %eax /* eb = biased exponent of x */
+ /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
+ subl $68, %eax
+ movl $28, %ecx /* %cl=28 */
+ movl %eax, %edx /* bitpos copy */
+
+ /* j = bitpos/28; */
+ div %cl /* j in register %al=%ax/%cl */
+ movapd %xmm0, %xmm3 /* |x| */
+ /* clear unneeded remainder from %ah */
+ andl $0xff, %eax
+
+ imull $28, %eax, %ecx /* j*28 */
+ movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */
+ movapd %xmm0, %xmm5 /* |x| */
+ mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */
+ movapd %xmm0, %xmm1 /* |x| */
+ mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */
+ mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */
+ addl $19, %ecx /* j*28+19 */
+ mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */
+ cmpl %ecx, %edx /* bitpos>=j*28+19? */
+ jl L(very_large_skip1)
+
+ /* Here if bitpos>=j*28+19 */
+ andpd %xmm3, %xmm4 /* HI(tmp3) */
+ subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */
+L(very_large_skip1):
+
+ movsd MO1(DP_2POW52), %xmm6
+ movapd %xmm5, %xmm2 /* tmp2 copy */
+ addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */
+ movl $1, %edx
+ addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */
+ movsd 8+MO1(DP_2POW52), %xmm4
+ movd %xmm6, %eax /* k = I64_LO(tmp6); */
+ addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */
+ movl ARG_X, %ecx /* Load x */
+ comisd %xmm5, %xmm4 /* tmp4 > tmp5? */
+ jbe L(very_large_skip2)
+
+ /* Here if tmp4 > tmp5 */
+ subl $1, %eax /* k-- */
+ addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */
+L(very_large_skip2):
+
+ andl %eax, %edx /* k&1 */
+ subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */
+ addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */
+ addsd %xmm2, %xmm3 /* t += tmp2 */
+ shrl $31, %ecx /* sign of x */
+ addsd %xmm3, %xmm0 /* t += tmp0 */
+ addl $1, %eax /* n=k+1 */
+ addsd %xmm1, %xmm0 /* t += tmp1 */
+ mulsd MO1(DP_PIO4), %xmm0 /* t *= PI04 */
+
+ jmp L(reconstruction) /* end of very_large_args path */
+
+ .p2align 4
+L(arg_less_pio4):
+ /* Here if |x|<Pi/4 */
+ cmpl $0x3d000000, %eax /* |x|<2^-5? */
+ jl L(arg_less_2pn5)
+
+ /* Here if 2^-5<=|x|<Pi/4 */
+ movaps %xmm0, %xmm3 /* x */
+ mulsd %xmm0, %xmm0 /* y=x^2 */
+ movaps %xmm0, %xmm1 /* y */
+ mulsd %xmm0, %xmm0 /* z=x^4 */
+ movsd MO1(DP_S4), %xmm4 /* S4 */
+ mulsd %xmm0, %xmm4 /* z*S4 */
+ movsd MO1(DP_S3), %xmm5 /* S3 */
+ mulsd %xmm0, %xmm5 /* z*S3 */
+ addsd MO1(DP_S2), %xmm4 /* S2+z*S4 */
+ mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */
+ addsd MO1(DP_S1), %xmm5 /* S1+z*S3 */
+ mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */
+ addsd MO1(DP_S0), %xmm4 /* S0+z*(S2+z*S4) */
+ mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */
+ mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */
+ mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */
+ /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm5, %xmm4
+ /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
+ addsd %xmm4, %xmm3
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+
+L(epilogue):
+ lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */
+ movss %xmm3, 0(%esp) /* Move result from sse... */
+ flds 0(%esp) /* ...to FPU. */
+ /* Return back 4 bytes of stack frame */
+ lea 4(%esp), %esp
+ RETURN
+
+ .p2align 4
+L(arg_less_2pn5):
+ /* Here if |x|<2^-5 */
+ cmpl $0x32000000, %eax /* |x|<2^-27? */
+ jl L(arg_less_2pn27)
+
+ /* Here if 2^-27<=|x|<2^-5 */
+ movaps %xmm0, %xmm1 /* DP x */
+ mulsd %xmm0, %xmm0 /* DP x^2 */
+ movsd MO1(DP_SIN2_1), %xmm3 /* DP DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */
+ addsd MO1(DP_SIN2_0), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */
+ mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */
+ mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */
+ cvtsd2ss %xmm3, %xmm3 /* SP result */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_less_2pn27):
+ movss ARG_X, %xmm3 /* SP x */
+ cmpl $0, %eax /* x=0? */
+ je L(epilogue) /* in case x=0 return sin(+-0)==+-0 */
+ /* Here if |x|<2^-27 */
+ /*
+ * Special cases here:
+ * sin(subnormal) raises inexact/underflow
+ * sin(min_normalized) raises inexact/underflow
+ * sin(normalized) raises inexact
+ */
+ movaps %xmm0, %xmm3 /* Copy of DP x */
+ mulsd MO1(DP_SMALL), %xmm0 /* x*DP_SMALL */
+ subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */
+ cvtsd2ss %xmm3, %xmm3 /* Result converted to SP */
+ jmp L(epilogue)
+
+ .p2align 4
+L(arg_inf_or_nan):
+ /* Here if |x| is Inf or NAN */
+ jne L(skip_errno_setting) /* in case x is NaN */
+
+ /* Here if x is Inf. Set errno to EDOM. */
+ call JUMPTARGET(__errno_location)
+ movl $EDOM, (%eax)
+
+ .p2align 4
+L(skip_errno_setting):
+ /* Here if |x| is Inf or NAN. Continued. */
+ movss ARG_X, %xmm3 /* load x */
+ subss %xmm3, %xmm3 /* Result is NaN */
+ jmp L(epilogue)
+END(__sinf_sse2)
+
+ .section .rodata, "a"
+ .p2align 3
+L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
+ .long 0x00000000,0x00000000
+ .long 0x54442d18,0x3fe921fb
+ .long 0x54442d18,0x3ff921fb
+ .long 0x7f3321d2,0x4002d97c
+ .long 0x54442d18,0x400921fb
+ .long 0x2955385e,0x400f6a7a
+ .long 0x7f3321d2,0x4012d97c
+ .long 0xe9bba775,0x4015fdbb
+ .long 0x54442d18,0x401921fb
+ .long 0xbeccb2bb,0x401c463a
+ .long 0x2955385e,0x401f6a7a
+ .type L(PIO4J), @object
+ ASM_SIZE_DIRECTIVE(L(PIO4J))
+
+ .p2align 3
+L(_FPI): /* 4/Pi broken into sum of positive DP values */
+ .long 0x00000000,0x00000000
+ .long 0x6c000000,0x3ff45f30
+ .long 0x2a000000,0x3e3c9c88
+ .long 0xa8000000,0x3c54fe13
+ .long 0xd0000000,0x3aaf47d4
+ .long 0x6c000000,0x38fbb81b
+ .long 0xe0000000,0x3714acc9
+ .long 0x7c000000,0x3560e410
+ .long 0x56000000,0x33bca2c7
+ .long 0xac000000,0x31fbd778
+ .long 0xe0000000,0x300b7246
+ .long 0xe8000000,0x2e5d2126
+ .long 0x48000000,0x2c970032
+ .long 0xe8000000,0x2ad77504
+ .long 0xe0000000,0x290921cf
+ .long 0xb0000000,0x274deb1c
+ .long 0xe0000000,0x25829a73
+ .long 0xbe000000,0x23fd1046
+ .long 0x10000000,0x2224baed
+ .long 0x8e000000,0x20709d33
+ .long 0x80000000,0x1e535a2f
+ .long 0x64000000,0x1cef904e
+ .long 0x30000000,0x1b0d6398
+ .long 0x24000000,0x1964ce7d
+ .long 0x16000000,0x17b908bf
+ .type L(_FPI), @object
+ ASM_SIZE_DIRECTIVE(L(_FPI))
+
+/* Coefficients of polynomial
+ for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */
+ .p2align 3
+L(DP_SIN2_0):
+ .long 0x5543d49d,0xbfc55555
+ .type L(DP_SIN2_0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_0))
+
+ .p2align 3
+L(DP_SIN2_1):
+ .long 0x75cec8c5,0x3f8110f4
+ .type L(DP_SIN2_1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SIN2_1))
+
+ .p2align 3
+L(DP_ZERONE):
+ .long 0x00000000,0x00000000 /* 0.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ZERONE), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
+
+ .p2align 3
+L(DP_ONES):
+ .long 0x00000000,0x3ff00000 /* +1.0 */
+ .long 0x00000000,0xbff00000 /* -1.0 */
+ .type L(DP_ONES), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ONES))
+
+/* Coefficients of polynomial
+ for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_S3):
+ .long 0x64e6b5b4,0x3ec71d72
+ .type L(DP_S3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S3))
+
+ .p2align 3
+L(DP_S1):
+ .long 0x10c2688b,0x3f811111
+ .type L(DP_S1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S1))
+
+ .p2align 3
+L(DP_S4):
+ .long 0x1674b58a,0xbe5a947e
+ .type L(DP_S4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S4))
+
+ .p2align 3
+L(DP_S2):
+ .long 0x8b4bd1f9,0xbf2a019f
+ .type L(DP_S2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S2))
+
+ .p2align 3
+L(DP_S0):
+ .long 0x55551cd9,0xbfc55555
+ .type L(DP_S0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_S0))
+
+ .p2align 3
+L(DP_SMALL):
+ .long 0x00000000,0x3cd00000 /* 2^(-50) */
+ .type L(DP_SMALL), @object
+ ASM_SIZE_DIRECTIVE(L(DP_SMALL))
+
+/* Coefficients of polynomial
+ for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */
+ .p2align 3
+L(DP_C3):
+ .long 0x9ac43cc0,0x3efa00eb
+ .type L(DP_C3), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C3))
+
+ .p2align 3
+L(DP_C1):
+ .long 0x545c50c7,0x3fa55555
+ .type L(DP_C1), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C1))
+
+ .p2align 3
+L(DP_C4):
+ .long 0xdd8844d7,0xbe923c97
+ .type L(DP_C4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C4))
+
+ .p2align 3
+L(DP_C2):
+ .long 0x348b6874,0xbf56c16b
+ .type L(DP_C2), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C2))
+
+ .p2align 3
+L(DP_C0):
+ .long 0xfffe98ae,0xbfdfffff
+ .type L(DP_C0), @object
+ ASM_SIZE_DIRECTIVE(L(DP_C0))
+
+ .p2align 3
+L(DP_PIO4):
+ .long 0x54442d18,0x3fe921fb /* Pi/4 */
+ .type L(DP_PIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4))
+
+ .p2align 3
+L(DP_2POW52):
+ .long 0x00000000,0x43300000 /* +2^52 */
+ .long 0x00000000,0xc3300000 /* -2^52 */
+ .type L(DP_2POW52), @object
+ ASM_SIZE_DIRECTIVE(L(DP_2POW52))
+
+ .p2align 3
+L(DP_INVPIO4):
+ .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */
+ .type L(DP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
+
+ .p2align 3
+L(DP_PIO4HI):
+ .long 0x54000000,0xbfe921fb /* High part of Pi/4 */
+ .type L(DP_PIO4HI), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
+
+ .p2align 3
+L(DP_PIO4LO):
+ .long 0x11a62633,0xbe010b46 /* Low part of Pi/4 */
+ .type L(DP_PIO4LO), @object
+ ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
+
+ .p2align 2
+L(SP_INVPIO4):
+ .long 0x3fa2f983 /* 4/Pi */
+ .type L(SP_INVPIO4), @object
+ ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
+
+ .p2align 4
+L(DP_ABS_MASK): /* Mask for getting DP absolute value */
+ .long 0xffffffff,0x7fffffff
+ .long 0xffffffff,0x7fffffff
+ .type L(DP_ABS_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
+
+ .p2align 3
+L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
+ .long 0x00000000,0xffffffff
+ .type L(DP_HI_MASK), @object
+ ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
+
+weak_alias (__sinf, sinf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
new file mode 100644
index 0000000000..8ccdd2f34d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c
@@ -0,0 +1,28 @@
+/* Multiple versions of sinf
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern float __sinf_sse2 (float);
+extern float __sinf_ia32 (float);
+float __sinf (float);
+
+libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32);
+weak_alias (__sinf, sinf);
+#define SINF __sinf_ia32
+#include <sysdeps/ieee754/flt-32/s_sinf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
new file mode 100644
index 0000000000..ace8db9410
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmax)
+ fldl 4(%esp) // x
+ fldl 12(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fxch
+
+ fucomi %st(1), %st
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmax)
+weak_alias (__fmax, fmax)
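In C terms, the fucomi/fcmovu pair implements the "NaN counts as a missing argument" rule before the ordinary comparison. A hedged reference version of the same semantics (fmax_ref is an invented name; which of +0 and -0 wins when the operands compare equal is unspecified in both versions):

    /* Hedged C reference for the semantics of __fmax above.  */
    static double fmax_ref (double x, double y)
    {
      if (x != x)              /* x is NaN: treat it as missing */
        return y;
      if (y != y)              /* y is NaN: treat it as missing */
        return x;
      return x < y ? y : x;
    }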
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..3a25951a09
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S
@@ -0,0 +1,39 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmaxf)
+ flds 4(%esp) // x
+ flds 8(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fxch
+
+ fucomi %st(1), %st
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmaxf)
+weak_alias (__fmaxf, fmaxf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..3f6c21c63d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S
@@ -0,0 +1,58 @@
+/* Compute maximum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmaxl)
+ fldt 4(%esp) // x
+ fldt 16(%esp) // x : y
+
+ fucomi %st(1), %st
+ jp 2f
+ fcmovb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+
+2: // Unordered.
+ fucomi %st(0), %st
+ jp 3f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) may or may not be.
+ fxch
+ fucomi %st(0), %st
+ jp 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fmaxl)
+weak_alias (__fmaxl, fmaxl)
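The testb $0x40, 11(%esp) above probes bit 62 of the x87 extended operand: fldt read x from offset 4, byte 7 of the 10-byte value holds significand bits 56..63, and mask 0x40 selects the quiet/signaling flag (the y operand at 16(%esp) is probed at 16+7 = 23). A hedged C sketch of the same test, assuming the x86 long double layout; a complete sNaN classifier would also check that the exponent field is all ones:

    #include <string.h>

    /* Hedged sketch: nonzero if the quiet bit of an x87 long double
       is set (significand bit 62).  */
    static int quiet_bit_set (long double x)
    {
      unsigned char b[10];
      memcpy (b, &x, 10);           /* low 10 bytes: x87 extended value */
      return (b[7] & 0x40) != 0;    /* bit 62 lives in byte 7, mask 0x40 */
    }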
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
new file mode 100644
index 0000000000..72d306fd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fmin)
+ fldl 4(%esp) // x
+ fldl 12(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fucomi %st(1), %st
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fmin)
+weak_alias (__fmin, fmin)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
new file mode 100644
index 0000000000..52ea892bad
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S
@@ -0,0 +1,37 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminf)
+ flds 4(%esp) // x
+ flds 8(%esp) // x : y
+
+ fucomi %st(0), %st
+ fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise
+
+ fucomi %st(1), %st
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+END(__fminf)
+weak_alias (__fminf, fminf)
diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
new file mode 100644
index 0000000000..e1cb83fed7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S
@@ -0,0 +1,58 @@
+/* Compute minimum of two numbers, regarding NaN as missing argument.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__fminl)
+ fldt 4(%esp) // x
+ fldt 16(%esp) // x : y
+
+ fucomi %st(1), %st
+ jp 2f
+ fcmovnb %st(1), %st
+
+ fstp %st(1)
+
+ ret
+
+2: // Unordered.
+ fucomi %st(0), %st
+ jp 3f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 11(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+3: // st(0) is a NaN; st(1) may or may not be.
+ fxch
+ fucomi %st(0), %st
+ jp 4f
+ // st(1) is a NaN; st(0) is not. Test if st(1) is signaling.
+ testb $0x40, 23(%esp)
+ jz 4f
+ fstp %st(1)
+ ret
+
+4: // Both arguments are NaNs, or one is a signaling NaN.
+ faddp
+ ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/i686/hp-timing.h b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
new file mode 100644
index 0000000000..1b11410feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
@@ -0,0 +1,42 @@
+/* High precision, low overhead timing functions. i686 version.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _HP_TIMING_H
+#define _HP_TIMING_H 1
+
+/* We always assume having the timestamp register. */
+#define HP_TIMING_AVAIL (1)
+#define HP_SMALL_TIMING_AVAIL (1)
+
+/* We indeed have inlined functions. */
+#define HP_TIMING_INLINE (1)
+
+/* We use 64bit values for the times. */
+typedef unsigned long long int hp_timing_t;
+
+/* That's quite simple. Use the `rdtsc' instruction. Note that the value
+ might not be 100% accurate since other instructions may be in flight at
+ that moment. This could be remedied by issuing a serializing barrier such
+ as `cpuid' right before the `rdtsc' instruction. But we are not
+ interested in cycle-accurate counts here, so we don't do this. */
+#define HP_TIMING_NOW(Var) __asm__ __volatile__ ("rdtsc" : "=A" (Var))
+
+#include <hp-timing-common.h>
+
+#endif /* hp-timing.h */
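A hedged usage sketch: HP_TIMING_NOW fills a 64-bit cycle counter, and an interval is just the difference of two samples. The "=A" constraint (the EDX:EAX pair) is i386-specific, matching this header; the macro and typedef are copied from it so the snippet stands alone:

    #include <stdio.h>

    typedef unsigned long long int hp_timing_t;
    #define HP_TIMING_NOW(Var) __asm__ __volatile__ ("rdtsc" : "=A" (Var))

    int main (void)
    {
      hp_timing_t start, stop;
      HP_TIMING_NOW (start);
      /* ... code under measurement ... */
      HP_TIMING_NOW (stop);
      printf ("elapsed cycles: %llu\n", stop - start);
      return 0;
    }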
diff --git a/REORG.TODO/sysdeps/i386/i686/init-arch.h b/REORG.TODO/sysdeps/i386/i686/init-arch.h
new file mode 100644
index 0000000000..f55f80efa0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MINIMUM_ISA 686
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/memcmp.S b/REORG.TODO/sysdeps/i386/i686/memcmp.S
new file mode 100644
index 0000000000..5140ee2145
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcmp.S
@@ -0,0 +1,408 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* Preserve EBX. */
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define ENTRANCE pushl %ebx; cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (ebx, 0)
+#define RETURN popl %ebx; cfi_adjust_cfa_offset (-4); \
+ cfi_restore (ebx); ret
+
+/* Load an entry in a jump table into EBX. TABLE is a jump table
+ with relative offsets. INDEX is a register containing the index
+ into the jump table. */
+#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,4), %ebx
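The table stores label-minus-table offsets instead of absolute addresses, so it needs no load-time relocations and works unchanged under PIC. GNU C's labels-as-values extension expresses the same idea; a hedged sketch with invented names:

    #include <stdio.h>

    /* Hedged analogue of LOAD_JUMP_TABLE_ENTRY: a table of offsets
       relative to a base label, resolved with a computed goto.  */
    static int dispatch (int index)
    {
      static const int offsets[] = {
        &&case0 - &&case0,
        &&case1 - &&case0,
        &&case2 - &&case0,
      };
      goto *(&&case0 + offsets[index]);
     case0:
      return 0;
     case1:
      return 10;
     case2:
      return 20;
    }

    int main (void)
    {
      printf ("%d\n", dispatch (2));   /* prints 20 */
      return 0;
    }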
+
+ .text
+ ALIGN (4)
+ENTRY (memcmp)
+ ENTRANCE
+
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+ cmpl $1, %ecx
+ jne L(not_1)
+ movzbl (%eax), %ecx /* LEN == 1 */
+ cmpb (%edx), %cl
+ jne L(neq)
+L(bye):
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+L(neq):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+L(not_1):
+ jl L(bye) /* LEN == 0 */
+
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ movl %eax, %esi
+ cfi_rel_offset (esi, 0)
+ cmpl $32, %ecx
+ jge L(32bytesormore) /* LEN >= 32 */
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+ ALIGN (4)
+L(28bytes):
+ movl -28(%esi), %eax
+ movl -28(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%esi), %eax
+ movl -24(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%esi), %eax
+ movl -20(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%esi), %eax
+ movl -16(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%esi), %eax
+ movl -12(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%esi), %eax
+ movl -8(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%esi), %eax
+ movl -4(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(29bytes):
+ movl -29(%esi), %eax
+ movl -29(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%esi), %eax
+ movl -25(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%esi), %eax
+ movl -21(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%esi), %eax
+ movl -17(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%esi), %eax
+ movl -13(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%esi), %eax
+ movl -9(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%esi), %eax
+ movl -5(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(30bytes):
+ movl -30(%esi), %eax
+ movl -30(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%esi), %eax
+ movl -26(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%esi), %eax
+ movl -22(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%esi), %eax
+ movl -18(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%esi), %eax
+ movl -14(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%esi), %eax
+ movl -10(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%esi), %eax
+ movl -6(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%esi), %eax
+ movzwl -2(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+L(31bytes):
+ movl -31(%esi), %eax
+ movl -31(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%esi), %eax
+ movl -27(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%esi), %eax
+ movl -23(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%esi), %eax
+ movl -19(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%esi), %eax
+ movl -15(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%esi), %eax
+ movl -11(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%esi), %eax
+ movl -7(%edx), %ecx
+ cmpl %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%esi), %eax
+ movzwl -3(%edx), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmpl %ecx, %eax
+ jne L(set)
+ movzbl -1(%esi), %eax
+ cmpb -1(%edx), %al
+ jne L(set)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ xorl %eax, %eax
+ RETURN
+
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (esi, 0)
+ cfi_rel_offset (ebx, 4)
+ ALIGN (4)
+/* ECX >= 32. */
+L(32bytesormore):
+ subl $32, %ecx
+
+ movl (%esi), %eax
+ cmpl (%edx), %eax
+ jne L(load_ecx)
+
+ movl 4(%esi), %eax
+ cmpl 4(%edx), %eax
+ jne L(load_ecx_4)
+
+ movl 8(%esi), %eax
+ cmpl 8(%edx), %eax
+ jne L(load_ecx_8)
+
+ movl 12(%esi), %eax
+ cmpl 12(%edx), %eax
+ jne L(load_ecx_12)
+
+ movl 16(%esi), %eax
+ cmpl 16(%edx), %eax
+ jne L(load_ecx_16)
+
+ movl 20(%esi), %eax
+ cmpl 20(%edx), %eax
+ jne L(load_ecx_20)
+
+ movl 24(%esi), %eax
+ cmpl 24(%edx), %eax
+ jne L(load_ecx_24)
+
+ movl 28(%esi), %eax
+ cmpl 28(%edx), %eax
+ jne L(load_ecx_28)
+
+ addl $32, %esi
+ addl $32, %edx
+ cmpl $32, %ecx
+ jge L(32bytesormore)
+
+ LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx)
+ addl %ecx, %edx
+ addl %ecx, %esi
+ jmp *%ebx
+
+L(load_ecx_28):
+ addl $0x4, %edx
+L(load_ecx_24):
+ addl $0x4, %edx
+L(load_ecx_20):
+ addl $0x4, %edx
+L(load_ecx_16):
+ addl $0x4, %edx
+L(load_ecx_12):
+ addl $0x4, %edx
+L(load_ecx_8):
+ addl $0x4, %edx
+L(load_ecx_4):
+ addl $0x4, %edx
+L(load_ecx):
+ movl (%edx), %ecx
+
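+/* EAX/ECX hold the first differing dwords, loaded little-endian, so
+   a plain dword compare would weight the highest-addressed byte the
+   most.  memcmp's result must instead be based on the first (lowest
+   addressed) differing byte, hence the byte-by-byte comparison.  */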
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpb %ch, %ah
+ jne L(set)
+ shrl $16,%eax
+ shrl $16,%ecx
+ cmpb %cl, %al
+ jne L(set)
+	/* We get here only if we already know there is a
+	   difference.  */
+ cmpl %ecx, %eax
+L(set):
+ sbbl %eax, %eax
+ sbbl $-1, %eax
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ RETURN
+END (memcmp)
+
+ .section .rodata
+ ALIGN (2)
+L(table_32bytes):
+ .long L(0bytes) - L(table_32bytes)
+ .long L(1bytes) - L(table_32bytes)
+ .long L(2bytes) - L(table_32bytes)
+ .long L(3bytes) - L(table_32bytes)
+ .long L(4bytes) - L(table_32bytes)
+ .long L(5bytes) - L(table_32bytes)
+ .long L(6bytes) - L(table_32bytes)
+ .long L(7bytes) - L(table_32bytes)
+ .long L(8bytes) - L(table_32bytes)
+ .long L(9bytes) - L(table_32bytes)
+ .long L(10bytes) - L(table_32bytes)
+ .long L(11bytes) - L(table_32bytes)
+ .long L(12bytes) - L(table_32bytes)
+ .long L(13bytes) - L(table_32bytes)
+ .long L(14bytes) - L(table_32bytes)
+ .long L(15bytes) - L(table_32bytes)
+ .long L(16bytes) - L(table_32bytes)
+ .long L(17bytes) - L(table_32bytes)
+ .long L(18bytes) - L(table_32bytes)
+ .long L(19bytes) - L(table_32bytes)
+ .long L(20bytes) - L(table_32bytes)
+ .long L(21bytes) - L(table_32bytes)
+ .long L(22bytes) - L(table_32bytes)
+ .long L(23bytes) - L(table_32bytes)
+ .long L(24bytes) - L(table_32bytes)
+ .long L(25bytes) - L(table_32bytes)
+ .long L(26bytes) - L(table_32bytes)
+ .long L(27bytes) - L(table_32bytes)
+ .long L(28bytes) - L(table_32bytes)
+ .long L(29bytes) - L(table_32bytes)
+ .long L(30bytes) - L(table_32bytes)
+ .long L(31bytes) - L(table_32bytes)
+
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/memcpy.S b/REORG.TODO/sysdeps/i386/i686/memcpy.S
new file mode 100644
index 0000000000..1d61447430
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcpy.S
@@ -0,0 +1,98 @@
+/* Copy memory block and return pointer to beginning of destination block.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memcpy_chk)
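+	/* __memcpy_chk (dst, src, len, dstlen): fail if dstlen < len,
+	   otherwise fall through into memcpy below.  */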
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memcpy_chk)
+#endif
+ENTRY (memcpy)
+
+ movl %edi, %eax
+ movl DEST(%esp), %edi
+ movl %esi, %edx
+ movl SRC(%esp), %esi
+
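+	/* Neither "movl" nor "cld" touches EFLAGS, so the "jne" below
+	   still tests the "andl" result: whether source and destination
+	   share the same alignment modulo 4.  */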
+ movl %edi, %ecx
+ xorl %esi, %ecx
+ andl $3, %ecx
+ movl LEN(%esp), %ecx
+ cld
+ jne .Lunaligned
+
+ cmpl $3, %ecx
+ jbe .Lunaligned
+
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+ testl $3, %esi
+ je 1f
+ movsb
+ decl %ecx
+1: pushl %eax
+ movl %ecx, %eax
+ shrl $2, %ecx
+ andl $3, %eax
+ rep
+ movsl
+ movl %eax, %ecx
+ rep
+ movsb
+ popl %eax
+
+.Lend: movl %eax, %edi
+ movl %edx, %esi
+ movl DEST(%esp), %eax
+
+ ret
+
+ /* When we come here the pointers do not have the same
+ alignment or the length is too short. No need to optimize for
+ aligned memory accesses. */
+.Lunaligned:
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ jmp .Lend
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memmove.S b/REORG.TODO/sysdeps/i386/i686/memmove.S
new file mode 100644
index 0000000000..f60c3d501b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memmove.S
@@ -0,0 +1,120 @@
+/* Copy memory block and return pointer to beginning of destination block.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 2003.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* one spilled register */
+#define RTN PARMS
+
+ .text
+
+#ifdef USE_AS_BCOPY
+# define SRC RTN
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST RTN
+# define SRC DEST+4
+# define LEN SRC+4
+
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memmove_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memmove_chk)
+# endif
+#endif
+
+ENTRY (memmove)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+
+ movl LEN(%esp), %ecx
+ movl DEST(%esp), %edi
+ cfi_rel_offset (edi, 0)
+ movl %esi, %edx
+ movl SRC(%esp), %esi
+ cfi_register (esi, edx)
+
+ movl %edi, %eax
+ subl %esi, %eax
+ cmpl %eax, %ecx
+ ja 3f
+
+ cld
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ movl %edx, %esi
+ cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+#endif
+
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+ cfi_register (esi, edx)
+
+ /* Backward copying. */
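+	/* With the direction flag set the string instructions walk
+	   downwards, so point ESI/EDI at the last byte first; the later
+	   "subl" adjustments move the pointers to the low byte of the
+	   next word/dword unit before movsw/movsl.  */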
+3: std
+ leal -1(%edi, %ecx), %edi
+ leal -1(%esi, %ecx), %esi
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: subl $1, %edi
+ subl $1, %esi
+ shrl $1, %ecx
+ jnc 2f
+ movsw
+2: subl $2, %edi
+ subl $2, %esi
+ rep
+ movsl
+ movl %edx, %esi
+ cfi_restore (esi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+#endif
+
+ cld
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memmove)
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (memmove)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
new file mode 100644
index 0000000000..31cb4efdb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/mempcpy.S
@@ -0,0 +1,65 @@
+/* Copy memory block and return pointer to following byte.
+ For Intel 80x86, x>=6.
+ This file is part of the GNU C Library.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+#if defined PIC && IS_IN (libc)
+ENTRY_CHK (__mempcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__mempcpy_chk)
+#endif
+ENTRY (__mempcpy)
+
+ movl LEN(%esp), %ecx
+ movl %edi, %eax
+ cfi_register (edi, eax)
+ movl DEST(%esp), %edi
+ movl %esi, %edx
+ cfi_register (esi, edx)
+ movl SRC(%esp), %esi
+ cld
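+	/* Shift the length's low bits into CF: copy one byte if the
+	   length is odd, one word if bit 1 is set, then the rest as
+	   dwords with "rep movsl".  */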
+ shrl $1, %ecx
+ jnc 1f
+ movsb
+1: shrl $1, %ecx
+ jnc 2f
+ movsw
+2: rep
+ movsl
+ xchgl %edi, %eax
+ cfi_restore (edi)
+ movl %edx, %esi
+ cfi_restore (esi)
+
+ ret
+END (__mempcpy)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/memset.S b/REORG.TODO/sysdeps/i386/i686/memset.S
new file mode 100644
index 0000000000..24d06178d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memset.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+ Highly optimized version for ix86, x>=6.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+#else
+# define RTN PARMS
+# define DEST RTN
+# define CHR DEST+4
+# define LEN CHR+4
+#endif
+
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY_CHK (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk)
+#endif
+ENTRY (memset)
+
+ cld
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ movl DEST(%esp), %edx
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xorl %eax, %eax /* fill with 0 */
+#else
+ movzbl CHR(%esp), %eax
+#endif
+ jecxz 1f
+ movl %edx, %edi
+ cfi_rel_offset (edi, 0)
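+	/* "andl" set the parity flag on its result: 1 and 2 have odd
+	   parity, 3 has even parity, so "jp" singles out the
+	   misaligned-by-3 case without another compare.  */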
+ andl $3, %edx
+ jz 2f /* aligned */
+ jp 3f /* misaligned at 3, store just one byte below */
+ stosb /* misaligned at 1 or 2, store two bytes */
+ decl %ecx
+ jz 1f
+3: stosb
+ decl %ecx
+ jz 1f
+ xorl $1, %edx
+ jnz 2f /* was misaligned at 2 or 3, now aligned */
+ stosb /* was misaligned at 1, store third byte */
+ decl %ecx
+2: movl %ecx, %edx
+ shrl $2, %ecx
+ andl $3, %edx
+#ifndef USE_AS_BZERO
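+	/* Replicate the fill byte into all four bytes of EAX, e.g.
+	   0x4e * 0x01010101 == 0x4e4e4e4e.  */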
+ imul $0x01010101, %eax
+#endif
+ rep
+ stosl
+ movl %edx, %ecx
+ rep
+ stosb
+
+1:
+#ifndef USE_AS_BZERO
+ movl DEST(%esp), %eax /* start address of destination is result */
+#endif
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (memset)
+libc_hidden_builtin_def (memset)
+
+#if defined SHARED && IS_IN (libc) && !defined __memset_chk \
+ && !defined USE_AS_BZERO
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/memusage.h b/REORG.TODO/sysdeps/i386/i686/memusage.h
new file mode 100644
index 0000000000..77a020d7c0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memusage.h
@@ -0,0 +1,21 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high))
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
new file mode 100644
index 0000000000..4a0c20c051
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile
@@ -0,0 +1,44 @@
+ifeq ($(subdir),csu)
+tests += test-multiarch
+endif
+
+ifeq ($(subdir),string)
+gen-as-const-headers += locale-defines.sym
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
+ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
+ memcmp-ssse3 memcmp-sse4 varshift \
+ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
+ strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
+ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
+ memchr-sse2 memchr-sse2-bsf \
+ memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+ rawmemchr-sse2 rawmemchr-sse2-bsf \
+ strnlen-sse2 strnlen-c \
+ strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
+ strncase_l-c strncase-c strncase_l-ssse3 \
+ strcasecmp_l-sse4 strncase_l-sse4 \
+ bcopy-sse2-unaligned memcpy-sse2-unaligned \
+ mempcpy-sse2-unaligned memmove-sse2-unaligned \
+ strcspn-c strpbrk-c strspn-c
+CFLAGS-varshift.c += -msse4
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+ wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
+endif
+
+ifeq ($(subdir),math)
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000000..efef2a10dd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 0000000000..cbc8b420e8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 0000000000..36aac44b9c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 0000000000..877f82c28f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,59 @@
+/* Multiple versions of bcopy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(bcopy)
+ .type bcopy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
+2: ret
+END(bcopy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bcopy_ia32, @function; \
+ .p2align 4; \
+ .globl __bcopy_ia32; \
+ .hidden __bcopy_ia32; \
+ __bcopy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
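+/* Build the generic i386 implementation as __bcopy_ia32 via the
+   ENTRY/END redefinitions above.  */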
+#include "../bcopy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 0000000000..507b288bb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 0000000000..8d04512e4e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 0000000000..9dac490aa2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,62 @@
+/* Multiple versions of bzero
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__bzero)
+ .type __bzero, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bzero_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+	LOAD_FUNC_GOT_EAX (__bzero_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
+2: ret
+END(__bzero)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bzero_ia32, @function; \
+ .p2align 4; \
+ .globl __bzero_ia32; \
+ .hidden __bzero_ia32; \
+ __bzero_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with the hidden functions in a shared library,
+   since they would be called without EBX set up for the PLT, which
+   IFUNC resolution relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
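+/* Likewise build the generic i386 implementation as __bzero_ia32.  */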
+#include "../bzero.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e8026a2a78
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -0,0 +1,376 @@
+/* Enumerate available IFUNC implementations of a function. i686 version.
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ifunc-impl-list.h>
+#include "init-arch.h"
+
+/* Maximum number of IFUNC implementations. */
+#define MAX_IFUNC 4
+
+/* Fill ARRAY of MAX elements with IFUNC implementations for function
+ NAME and return the number of valid entries. */
+
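+/* Each IFUNC_IMPL_ADD entry pairs a usability predicate with one
+   candidate; the multiarch string tests walk this list so that every
+   implementation the CPU supports gets exercised, not only the one
+   the IFUNC resolver would pick.  */
+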
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ /* Support sysdeps/i386/i686/multiarch/bcopy.S. */
+ IFUNC_IMPL (i, name, bcopy,
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+ __bcopy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3),
+ __bcopy_ssse3)
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
+ __bcopy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/bzero.S. */
+ IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+ __bzero_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
+ __bzero_sse2)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memchr.S. */
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+ __memchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2),
+ __memchr_sse2)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memcmp.S. */
+ IFUNC_IMPL (i, name, memcmp,
+ IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2),
+ __memcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+ __memcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */
+ IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memmove_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memmove_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memmove_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memmove.S. */
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+ __memmove_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+ __memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
+ __memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memrchr.S. */
+ IFUNC_IMPL (i, name, memrchr,
+ IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+ __memrchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2),
+ __memrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memset_chk.S. */
+ IFUNC_IMPL (i, name, __memset_chk,
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memset_chk_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memset.S. */
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+ __memset_sse2_rep)
+ IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
+ __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */
+ IFUNC_IMPL (i, name, rawmemchr,
+ IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+ __rawmemchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2),
+ __rawmemchr_sse2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/stpncpy.S. */
+ IFUNC_IMPL (i, name, stpncpy,
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
+ __stpncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2),
+ __stpncpy_sse2)
+ IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/stpcpy.S. */
+ IFUNC_IMPL (i, name, stpcpy,
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+ __stpcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2),
+ __stpcpy_sse2)
+ IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcasecmp.S. */
+ IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strcasecmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ HAS_CPU_FEATURE (SSSE3),
+ __strcasecmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S. */
+ IFUNC_IMPL (i, name, strcasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strcasecmp_l_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ HAS_CPU_FEATURE (SSSE3),
+ __strcasecmp_l_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+ __strcasecmp_l_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcat.S. */
+ IFUNC_IMPL (i, name, strcat,
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+ __strcat_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2),
+ __strcat_sse2)
+ IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strchr.S. */
+ IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+ __strchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2),
+ __strchr_sse2)
+ IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcmp.S. */
+ IFUNC_IMPL (i, name, strcmp,
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+ __strcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+ __strcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcpy.S. */
+ IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+ __strcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2),
+ __strcpy_sse2)
+ IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strcspn.S. */
+ IFUNC_IMPL (i, name, strcspn,
+ IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
+ __strcspn_sse42)
+ IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncase.S. */
+ IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strncasecmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ HAS_CPU_FEATURE (SSSE3),
+ __strncasecmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
+ __strncasecmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncase_l.S. */
+ IFUNC_IMPL (i, name, strncasecmp_l,
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ HAS_CPU_FEATURE (SSE4_2),
+ __strncasecmp_l_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ HAS_CPU_FEATURE (SSSE3),
+ __strncasecmp_l_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
+ __strncasecmp_l_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncat.S. */
+ IFUNC_IMPL (i, name, strncat,
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+ __strncat_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2),
+ __strncat_sse2)
+ IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncpy.S. */
+ IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+ __strncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2),
+ __strncpy_sse2)
+ IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2),
+ __strnlen_sse2)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strpbrk.S. */
+ IFUNC_IMPL (i, name, strpbrk,
+ IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
+ __strpbrk_sse42)
+ IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strrchr.S. */
+ IFUNC_IMPL (i, name, strrchr,
+ IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+ __strrchr_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2),
+ __strrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strspn.S. */
+ IFUNC_IMPL (i, name, strspn,
+ IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+ __strspn_sse42)
+ IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2),
+ __wcschr_sse2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcscmp.S. */
+ IFUNC_IMPL (i, name, wcscmp,
+ IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2),
+ __wcscmp_sse2)
+ IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcscpy.S. */
+ IFUNC_IMPL (i, name, wcscpy,
+ IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+ __wcscpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcslen.S. */
+ IFUNC_IMPL (i, name, wcslen,
+ IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2),
+ __wcslen_sse2)
+ IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wcsrchr.S. */
+ IFUNC_IMPL (i, name, wcsrchr,
+ IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2),
+ __wcsrchr_sse2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/wmemcmp.S. */
+ IFUNC_IMPL (i, name, wmemcmp,
+ IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2),
+ __wmemcmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+ __wmemcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32))
+
+#ifdef SHARED
+ /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */
+ IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memcpy_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __memcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __memcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/memcpy.S. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+ __memcpy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+ __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
+ __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */
+ IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_chk_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_CPU_FEATURE (SSE2),
+ __mempcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/mempcpy.S. */
+ IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_ssse3_rep)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+ __mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
+ __mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+ __strlen_sse2_bsf)
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2),
+ __strlen_sse2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32))
+
+ /* Support sysdeps/i386/i686/multiarch/strncmp.S. */
+ IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+ __strncmp_sse4_2)
+ IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+ __strncmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32))
+#endif
+
+ return i;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
new file mode 100644
index 0000000000..aebff9a4f9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym
@@ -0,0 +1,11 @@
+#include <locale/localeinfo.h>
+#include <langinfo.h>
+#include <stddef.h>
+
+--
+
+LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
+LC_CTYPE
+_NL_CTYPE_NONASCII_CASE
+LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
+SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000000..dd316486e6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,502 @@
+/* Optimized memchr with sse2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2_bsf
+# endif
+
+ .text
+ENTRY (MEMCHR)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null_1)
+# endif
+ mov %ecx, %eax
+
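+	/* Broadcast the search byte to all 16 lanes of xmm1:
+	   byte -> word -> dword via the two punpcklbw, then pshufd.  */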
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+
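+	/* If a 16-byte load at this offset would cross a 64-byte
+	   boundary (offset within the line > 48), take the crosscache
+	   path; staying inside one line also guarantees the unaligned
+	   load cannot stray onto another page.  */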
+ cmp $48, %ecx
+ ja L(crosscache)
+
+ movdqu (%eax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ je L(unaligned_no_match_1)
+/* Check which byte is a match. */
+ bsf %ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %ecx, %edx
+ jbe L(return_null_1)
+# endif
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+ sub $16, %edx
+ jbe L(return_null_1)
+ PUSH (%edi)
+ lea 16(%eax), %edi
+ and $15, %eax
+ and $-16, %edi
+ add %eax, %edx
+# else
+ lea 16(%eax), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(return_null_1):
+ xor %eax, %eax
+ ret
+
+# ifndef USE_AS_RAWMEMCHR
+ CFI_POP (%edi)
+# endif
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+
+# ifndef USE_AS_RAWMEMCHR
+ PUSH (%edi)
+ mov %eax, %edi
+ and $15, %ecx
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ mov %eax, %edx
+ and $15, %ecx
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ sar %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+/* Check which byte is a match. */
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %eax, %edx
+ jbe L(return_null)
+ add %edi, %eax
+ add %ecx, %eax
+ RETURN
+# else
+ add %edx, %eax
+ add %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate the last acceptable address and check for possible
+	   addition overflow by using saturated math:
+ edx = ecx + edx
+ edx |= -(edx < ecx) */
+ add %ecx, %edx
+ sbb %eax, %eax
+ or %eax, %edx
+ sub $16, %edx
+ jbe L(return_null)
+ add $16, %edi
+# else
+ add $16, %edx
+# endif
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ test $0x3f, %edi
+# else
+ test $0x3f, %edx
+# endif
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm3
+# else
+ movdqa 48(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+
+ pcmpeqb %xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ pmovmskb %xmm1, %eax
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 48(%edi, %eax), %eax
+ RETURN
+# else
+ lea 48(%edx, %eax), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 16(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ xor %eax, %eax
+ RETURN
+# endif
+ .p2align 4
+L(matches0):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea -16(%eax, %edi), %eax
+ RETURN
+# else
+ lea -16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ add %edi, %eax
+ RETURN
+# else
+ add %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches16):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 16(%eax, %edi), %eax
+ RETURN
+# else
+ lea 16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches32):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 32(%eax, %edi), %eax
+ RETURN
+# else
+ lea 32(%eax, %edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(matches_1):
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ add %edi, %eax
+ RETURN
+
+ .p2align 4
+L(matches16_1):
+ sub $16, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 16(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches32_1):
+ sub $32, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 32(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches48_1):
+ sub $48, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 48(%edi, %eax), %eax
+ RETURN
+# endif
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000000..172d70de13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,709 @@
+/* Optimized memchr with sse2 without bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+# define ENTRANCE PUSH(%edi);
+# define PARMS 8
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# else
+# define ENTRANCE
+# define PARMS 4
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2
+# endif
+
+ atom_text_section
+ENTRY (MEMCHR)
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null)
+# endif
+
+ punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov %ecx, %edi
+# else
+ mov %ecx, %edx
+# endif
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+ cmp $48, %ecx
+ ja L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqu (%edi), %xmm0
+# else
+ movdqu (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog)
+
+ sub $16, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+ and $15, %ecx
+ and $-16, %edi
+ add %ecx, %edx
+# else
+ jnz L(match_case1_prolog)
+ lea 16(%edx), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(crosscache):
+ and $15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ sar %cl, %eax
+ test %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog1)
+ /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using
+	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
+ possible addition overflow. */
+ neg %ecx
+ add $16, %ecx
+ sub %ecx, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+# else
+ jnz L(match_case1_prolog1)
+ lea 16(%edx), %edx
+# endif
+
+ .p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%edi), %xmm0
+# else
+ lea 64(%edx), %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ lea 64(%edx), %edx
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ xor %ecx, %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ pmovmskb %xmm2, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm1, %eax
+ lea 16(%ecx), %ecx
+
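+/* Decode the pcmpeqb mask one bit at a time rather than with "bsf";
+   this variant is chosen on CPUs such as Atom where "bsf" is slow
+   (see the Slow_BSF check in memchr.S).  */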
+ .p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+ add %ecx, %edi
+# else
+L(match_case1_prolog1):
+ add %ecx, %edx
+L(match_case1_prolog):
+# endif
+ test %al, %al
+ jz L(match_case1_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case1_8)
+ test $0x01, %al
+ jnz L(ExitCase1_1)
+ test $0x02, %al
+ jnz L(ExitCase1_2)
+ test $0x04, %al
+ jnz L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+ lea 3(%edi), %eax
+ RETURN
+# else
+ lea 3(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_8):
+ test $0x10, %al
+ jnz L(ExitCase1_5)
+ test $0x20, %al
+ jnz L(ExitCase1_6)
+ test $0x40, %al
+ jnz L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+ lea 7(%edi), %eax
+ RETURN
+# else
+ lea 7(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case1_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase1_9)
+ test $0x02, %ah
+ jnz L(ExitCase1_10)
+ test $0x04, %ah
+ jnz L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+ lea 11(%edi), %eax
+ RETURN
+# else
+ lea 11(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase1_13)
+ test $0x20, %ah
+ jnz L(ExitCase1_14)
+ test $0x40, %ah
+ jnz L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+ lea 15(%edi), %eax
+ RETURN
+# else
+ lea 15(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $32, %edx
+ jbe L(return_null)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+
+ xor %eax, %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %eax
+ RETURN
+# else
+ mov %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+ lea 1(%edi), %eax
+ RETURN
+# else
+ lea 1(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+ lea 2(%edi), %eax
+ RETURN
+# else
+ lea 2(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+ lea 4(%edi), %eax
+ RETURN
+# else
+ lea 4(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+ lea 5(%edi), %eax
+ RETURN
+# else
+ lea 5(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+ lea 6(%edi), %eax
+ RETURN
+# else
+ lea 6(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+ lea 8(%edi), %eax
+ RETURN
+# else
+ lea 8(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+ lea 9(%edi), %eax
+ RETURN
+# else
+ lea 9(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+ lea 10(%edi), %eax
+ RETURN
+# else
+ lea 10(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+ lea 12(%edi), %eax
+ RETURN
+# else
+ lea 12(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+ lea 13(%edi), %eax
+ RETURN
+# else
+ lea 13(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+ lea 14(%edi), %eax
+ RETURN
+# else
+ lea 14(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(match_case2):
+ sub %ecx, %edx
+L(match_case2_prolog1):
+ add %ecx, %edi
+L(match_case2_prolog):
+ test %al, %al
+ jz L(match_case2_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case2_8)
+ test $0x01, %al
+ jnz L(ExitCase2_1)
+ test $0x02, %al
+ jnz L(ExitCase2_2)
+ test $0x04, %al
+ jnz L(ExitCase2_3)
+ sub $4, %edx
+ jb L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_8):
+ test $0x10, %al
+ jnz L(ExitCase2_5)
+ test $0x20, %al
+ jnz L(ExitCase2_6)
+ test $0x40, %al
+ jnz L(ExitCase2_7)
+ sub $8, %edx
+ jb L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case2_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase2_9)
+ test $0x02, %ah
+ jnz L(ExitCase2_10)
+ test $0x04, %ah
+ jnz L(ExitCase2_11)
+ sub $12, %edx
+ jb L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase2_13)
+ test $0x20, %ah
+ jnz L(ExitCase2_14)
+ test $0x40, %ah
+ jnz L(ExitCase2_15)
+ sub $16, %edx
+ jb L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_1):
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_2):
+ sub $2, %edx
+ jb L(return_null)
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_3):
+ sub $3, %edx
+ jb L(return_null)
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_5):
+ sub $5, %edx
+ jb L(return_null)
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_6):
+ sub $6, %edx
+ jb L(return_null)
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_7):
+ sub $7, %edx
+ jb L(return_null)
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_9):
+ sub $9, %edx
+ jb L(return_null)
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_10):
+ sub $10, %edx
+ jb L(return_null)
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_11):
+ sub $11, %edx
+ jb L(return_null)
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_13):
+ sub $13, %edx
+ jb L(return_null)
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_14):
+ sub $14, %edx
+ jb L(return_null)
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_15):
+ sub $15, %edx
+ jb L(return_null)
+ lea 14(%edi), %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000000..bd0dace290
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__memchr)
+ .type __memchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
+	LOAD_FUNC_GOT_EAX (__memchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__memchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
+ ret
+END(__memchr)
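+
+/* Dispatch summary, added for clarity: processors without SSE2 get
+   __memchr_ia32; SSE2 processors whose BSF instruction is slow get
+   __memchr_sse2; the remaining SSE2 processors get __memchr_sse2_bsf.  */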
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memchr_ia32, @function; \
+ .globl __memchr_ia32; \
+ .p2align 4; \
+ __memchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..2aa13048b2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -0,0 +1,1225 @@
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_2
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1 + 4
+# define LEN BLK2 + 4
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjusted EAX/EDX.  Go.  */ \
+ jmp *%ebx
+# else
+# define JMPTBL(I, B) I
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
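+
+/* A rough C analogue of BRANCH_TO_JMPTBL_ENTRY in the SHARED case,
+   for illustration only (names hypothetical):
+
+     int32_t off = table[n];              entry stores target - table
+     void *tgt = (char *) table + off;    rebase to an absolute address
+     goto *tgt;                           the final jmp *%ebx
+*/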
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
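+
+/* Example: for the byte pair 0x80 vs 0x01, memcmp must yield a positive
+   result (0x80 = 128 unsigned), while a signed char comparison would
+   yield a negative one (0x80 = -128); wmemcmp instead compares whole
+   wchar_t elements, which are signed on this ABI.  */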
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+# else
+ cmp $1, %ecx
+ jbe L(less1bytes)
+# endif
+
+ pxor %xmm0, %xmm0
+ cmp $64, %ecx
+ ja L(64bytesormore)
+ cmp $8, %ecx
+
+# ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+# else
+ jb L(less8bytes)
+ PUSH (%ebx)
+# endif
+
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %bl
+ cmpb (%edx), %bl
+ jne L(nonzero)
+
+ mov 1(%eax), %bl
+ cmpb 1(%edx), %bl
+ jne L(nonzero)
+
+ cmp $2, %ecx
+ jz L(0bytes)
+
+ mov 2(%eax), %bl
+ cmpb 2(%edx), %bl
+ jne L(nonzero)
+
+ cmp $3, %ecx
+ jz L(0bytes)
+
+ mov 3(%eax), %bl
+ cmpb 3(%edx), %bl
+ jne L(nonzero)
+
+ cmp $4, %ecx
+ jz L(0bytes)
+
+ mov 4(%eax), %bl
+ cmpb 4(%edx), %bl
+ jne L(nonzero)
+
+ cmp $5, %ecx
+ jz L(0bytes)
+
+ mov 5(%eax), %bl
+ cmpb 5(%edx), %bl
+ jne L(nonzero)
+
+ cmp $6, %ecx
+ jz L(0bytes)
+
+ mov 6(%eax), %bl
+ cmpb 6(%edx), %bl
+ je L(0bytes)
+
+L(nonzero):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(above)
+ neg %eax
+L(above):
+ ret
+ CFI_PUSH (%ebx)
+# endif
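+
+/* In the ladder above, neither POP nor mov touches EFLAGS, so the ja in
+   L(nonzero) still sees the flags of the failing cmpb: +1 is returned
+   when the byte from BLK1 is above, -1 otherwise.  */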
+
+ .p2align 4
+L(0bytes):
+ POP (%ebx)
+ xor %eax, %eax
+ ret
+
+# ifdef USE_AS_WMEMCMP
+
+/* For wmemcmp, the case N == 1.  */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(0bytesend)
+ movzbl (%eax), %eax
+ movzbl (%edx), %edx
+ sub %edx, %eax
+ ret
+
+ .p2align 4
+L(0bytesend):
+ xor %eax, %eax
+ ret
+# endif
+ .p2align 4
+L(64bytesormore):
+ PUSH (%ebx)
+ mov %ecx, %ebx
+ mov $64, %ecx
+ sub $64, %ebx
+L(64bytesormore_loop):
+ movdqu (%eax), %xmm1
+ movdqu (%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_16diff)
+
+ movdqu 16(%eax), %xmm1
+ movdqu 16(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_32diff)
+
+ movdqu 32(%eax), %xmm1
+ movdqu 32(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_48diff)
+
+ movdqu 48(%eax), %xmm1
+ movdqu 48(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_64diff)
+ add %ecx, %eax
+ add %ecx, %edx
+ sub %ecx, %ebx
+ jae L(64bytesormore_loop)
+ add %ebx, %ecx
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
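+
+/* With %xmm0 zeroed, ptest %xmm2, %xmm0 sets CF exactly when %xmm2
+   (the XOR of the two 16-byte blocks) is all-zero, so each jnc above
+   branches out as soon as a block differs.  */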
+
+# ifdef USE_AS_WMEMCMP
+
+/* This label is needed only for filling table_64bytes.  */
+L(unreal_case):
+/* No code here.  */
+
+# endif
+ .p2align 4
+L(find_16diff):
+ sub $16, %ecx
+L(find_32diff):
+ sub $16, %ecx
+L(find_48diff):
+ sub $16, %ecx
+L(find_64diff):
+ add %ecx, %edx
+ add %ecx, %eax
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(49bytes):
+ movdqu -49(%eax), %xmm1
+ movdqu -49(%edx), %xmm2
+ mov $-49, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%eax), %xmm1
+ movdqu -33(%edx), %xmm2
+ mov $-33, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(50bytes):
+ mov $-50, %ebx
+ movdqu -50(%eax), %xmm1
+ movdqu -50(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ mov $-34, %ebx
+ movdqu -34(%eax), %xmm1
+ movdqu -34(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(51bytes):
+ mov $-51, %ebx
+ movdqu -51(%eax), %xmm1
+ movdqu -51(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ mov $-35, %ebx
+ movdqu -35(%eax), %xmm1
+ movdqu -35(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+L(1bytes):
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(52bytes):
+ movdqu -52(%eax), %xmm1
+ movdqu -52(%edx), %xmm2
+ mov $-52, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%eax), %xmm1
+ movdqu -36(%edx), %xmm2
+ mov $-36, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%eax), %xmm1
+ movdqu -20(%edx), %xmm2
+ mov $-20, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(53bytes):
+ movdqu -53(%eax), %xmm1
+ movdqu -53(%edx), %xmm2
+ mov $-53, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ mov $-37, %ebx
+ movdqu -37(%eax), %xmm1
+ movdqu -37(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ mov $-21, %ebx
+ movdqu -21(%eax), %xmm1
+ movdqu -21(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(54bytes):
+ movdqu -54(%eax), %xmm1
+ movdqu -54(%edx), %xmm2
+ mov $-54, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ mov $-38, %ebx
+ movdqu -38(%eax), %xmm1
+ movdqu -38(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ mov $-22, %ebx
+ movdqu -22(%eax), %xmm1
+ movdqu -22(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(55bytes):
+ movdqu -55(%eax), %xmm1
+ movdqu -55(%edx), %xmm2
+ mov $-55, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ mov $-39, %ebx
+ movdqu -39(%eax), %xmm1
+ movdqu -39(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ mov $-23, %ebx
+ movdqu -23(%eax), %xmm1
+ movdqu -23(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(56bytes):
+ movdqu -56(%eax), %xmm1
+ movdqu -56(%edx), %xmm2
+ mov $-56, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ mov $-40, %ebx
+ movdqu -40(%eax), %xmm1
+ movdqu -40(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ mov $-24, %ebx
+ movdqu -24(%eax), %xmm1
+ movdqu -24(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(57bytes):
+ movdqu -57(%eax), %xmm1
+ movdqu -57(%edx), %xmm2
+ mov $-57, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ mov $-41, %ebx
+ movdqu -41(%eax), %xmm1
+ movdqu -41(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ mov $-25, %ebx
+ movdqu -25(%eax), %xmm1
+ movdqu -25(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(58bytes):
+ movdqu -58(%eax), %xmm1
+ movdqu -58(%edx), %xmm2
+ mov $-58, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ mov $-42, %ebx
+ movdqu -42(%eax), %xmm1
+ movdqu -42(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ mov $-26, %ebx
+ movdqu -26(%eax), %xmm1
+ movdqu -26(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(59bytes):
+ movdqu -59(%eax), %xmm1
+ movdqu -59(%edx), %xmm2
+ mov $-59, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ mov $-43, %ebx
+ movdqu -43(%eax), %xmm1
+ movdqu -43(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ mov $-27, %ebx
+ movdqu -27(%eax), %xmm1
+ movdqu -27(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+ .p2align 4
+L(60bytes):
+ movdqu -60(%eax), %xmm1
+ movdqu -60(%edx), %xmm2
+ mov $-60, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ mov $-44, %ebx
+ movdqu -44(%eax), %xmm1
+ movdqu -44(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ mov $-28, %ebx
+ movdqu -28(%eax), %xmm1
+ movdqu -28(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(61bytes):
+ movdqu -61(%eax), %xmm1
+ movdqu -61(%edx), %xmm2
+ mov $-61, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ mov $-45, %ebx
+ movdqu -45(%eax), %xmm1
+ movdqu -45(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ mov $-29, %ebx
+ movdqu -29(%eax), %xmm1
+ movdqu -29(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(62bytes):
+ movdqu -62(%eax), %xmm1
+ movdqu -62(%edx), %xmm2
+ mov $-62, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ mov $-46, %ebx
+ movdqu -46(%eax), %xmm1
+ movdqu -46(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ mov $-30, %ebx
+ movdqu -30(%eax), %xmm1
+ movdqu -30(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(63bytes):
+ movdqu -63(%eax), %xmm1
+ movdqu -63(%edx), %xmm2
+ mov $-63, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ mov $-47, %ebx
+ movdqu -47(%eax), %xmm1
+ movdqu -47(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ mov $-31, %ebx
+ movdqu -31(%eax), %xmm1
+ movdqu -31(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+# endif
+
+ .p2align 4
+L(64bytes):
+ movdqu -64(%eax), %xmm1
+ movdqu -64(%edx), %xmm2
+ mov $-64, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%eax), %xmm1
+ movdqu -48(%edx), %xmm2
+ mov $-48, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%eax), %xmm1
+ movdqu -32(%edx), %xmm2
+ mov $-32, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -16(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ mov (%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ mov 4(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ mov 8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ mov 12(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
+
+ .p2align 4
+L(find_diff):
+# ifndef USE_AS_WMEMCMP
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16, %ecx
+ shr $16, %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
+# else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+# endif
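+
+/* L(find_diff) receives the first differing dwords in %ecx and %ebx
+   (memcmp) and narrows down to the first differing byte: low byte, low
+   word, then the upper halves after the shifts.  The final ja turns the
+   unsigned byte comparison into +1/-1; the wmemcmp variant instead uses
+   jg on the flags of the original signed dword comparison.  */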
+END (MEMCMP)
+
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 2
+ .type L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000000..5ebf5a4d73
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -0,0 +1,2157 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ atom_text_section
+ENTRY (MEMCMP)
+ movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
+ cmp $1, %ecx
+ jbe L(less1bytes)
+# endif
+
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
+
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(zero)
+ movb (%eax), %cl
+ cmp (%edx), %cl
+ je L(zero)
+ mov $1, %eax
+ ja L(1bytesend)
+ neg %eax
+L(1bytesend):
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(48bytesormore):
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ cfi_remember_state
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
+
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
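+
+/* The prologue above rounds %edi down to a 16-byte boundary, steps
+   %esi back by the same amount and grows the length accordingly; %edx
+   now holds the residual misalignment of %esi that selects L(shr_N).  */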
+
+# ifndef USE_AS_WMEMCMP
+ cmp $8, %edx
+ jae L(next_unaligned_table)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $1, %edx
+ je L(shr_1)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $3, %edx
+ je L(shr_3)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $5, %edx
+ je L(shr_5)
+ cmp $6, %edx
+ je L(shr_6)
+ jmp L(shr_7)
+
+ .p2align 2
+L(next_unaligned_table):
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $9, %edx
+ je L(shr_9)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $11, %edx
+ je L(shr_11)
+ cmp $12, %edx
+ je L(shr_12)
+ cmp $13, %edx
+ je L(shr_13)
+ cmp $14, %edx
+ je L(shr_14)
+ jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
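+
+/* Each L(shr_N) path reads aligned 16-byte words from the rounded-down
+   %esi and uses palignr $N to reassemble the original unaligned data,
+   so both streams can be compared with aligned loads.  */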
+
+ .p2align 4
+L(shr_0):
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_0_gobble):
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+L(shr_0_gobble_loop):
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
+L(shr_0_gobble_loop_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_1):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_1_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $1,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $1,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 1(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_1_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $1,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $1,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_1_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $1,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $1,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_1_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_1_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_1_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 1(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_2):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_2_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $2,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_2_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $2,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_2_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_3):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_3_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $3,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $3,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 3(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_3_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $3,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $3,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_3_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $3,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $3,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_3_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_3_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_3_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 3(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_4):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_4_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $4,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_4_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $4,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_4_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_5):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_5_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $5,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $5,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 5(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_5_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $5,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $5,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_5_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $5,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $5,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_5_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_5_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_5_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 5(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_6):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_6_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $6,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_6_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $6,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_6_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_7):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_7_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $7,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $7,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 7(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_7_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $7,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $7,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_7_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $7,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $7,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_7_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_7_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_7_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 7(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_8):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_8_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $8,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_8_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $8,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_8_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_9):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_9_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $9,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $9,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 9(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_9_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $9,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $9,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_9_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $9,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $9,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_9_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_9_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_9_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 9(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_10):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $10,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_10_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_10_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $10,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_10_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_11):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_11_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $11, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $11, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 11(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_11_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $11, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $11, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_11_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $11,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $11,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_11_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_11_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_11_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 11(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_12):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_12_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_12_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $12,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_12_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_13):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_13_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $13, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $13, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 13(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_13_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $13, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $13, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_13_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $13,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $13,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_13_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_13_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_13_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 13(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_14):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_14_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_14_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $14,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_14_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_15):
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_15_gobble)
+
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $15, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+
+ movdqa 32(%esi), %xmm3
+ palignr $15, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 15(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(shr_15_gobble):
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $15, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
+
+ movdqa 32(%esi), %xmm3
+ palignr $15, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
+
+L(shr_15_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%esi), %xmm3
+ palignr $15, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $15, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
+
+ lea 32(%edi), %edi
+ jz L(shr_15_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %ecx
+ jge L(shr_15_gobble_next)
+ inc %edx
+ add $32, %ecx
+L(shr_15_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea (%ecx, %edi,1), %eax
+ lea 15(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
+# endif
+
+ cfi_restore_state
+ cfi_remember_state
+ .p2align 4
+L(exit):
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
+
+L(first16bytes):
+ add %eax, %esi
+L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
+ test %dl, %dl
+ jz L(next_24_bytes)
+
+ test $0x01, %dl
+ jnz L(Byte16)
+
+ test $0x02, %dl
+ jnz L(Byte17)
+
+ test $0x04, %dl
+ jnz L(Byte18)
+
+ test $0x08, %dl
+ jnz L(Byte19)
+
+ test $0x10, %dl
+ jnz L(Byte20)
+
+ test $0x20, %dl
+ jnz L(Byte21)
+
+ test $0x40, %dl
+ jnz L(Byte22)
+L(Byte23):
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte16):
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte17):
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte18):
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte19):
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte20):
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte21):
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(Byte22):
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
+ sub %edx, %eax
+ RETURN
+
+ .p2align 4
+L(next_24_bytes):
+ lea 8(%edi), %edi
+ lea 8(%esi), %esi
+ test $0x01, %dh
+ jnz L(Byte16)
+
+ test $0x02, %dh
+ jnz L(Byte17)
+
+ test $0x04, %dh
+ jnz L(Byte18)
+
+ test $0x08, %dh
+ jnz L(Byte19)
+
+ test $0x10, %dh
+ jnz L(Byte20)
+
+ test $0x20, %dh
+ jnz L(Byte21)
+
+ test $0x40, %dh
+ jnz L(Byte22)
+
+ .p2align 4
+L(Byte31):
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
+ sub %edx, %eax
+ RETURN_END
+# else
+
+/* Special case for wmemcmp.  */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
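+
+/* In the byte dispatch above, EDX holds the pcmpeqb mask minus 0xffff,
+   so the lowest set bit in %dl/%dh selects the first differing byte of
+   the 16-byte block.  ESI and EDI already point past the block, hence
+   the negative displacements -16 ... -9 in the L(ByteNN) stubs, each
+   of which is roughly (a sketch, with p1/p2 for the two buffers):
+
+     return (unsigned char) p1[n - 16] - (unsigned char) p2[n - 16];
+*/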
+
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(more8bytes):
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $9, %ecx
+ je L(9bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $11, %ecx
+ je L(11bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ cmp $13, %ecx
+ je L(13bytes)
+ cmp $14, %ecx
+ je L(14bytes)
+ jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
+
+ .p2align 4
+L(more16bytes):
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $17, %ecx
+ je L(17bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $19, %ecx
+ je L(19bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ cmp $21, %ecx
+ je L(21bytes)
+ cmp $22, %ecx
+ je L(22bytes)
+ jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
+
+ .p2align 4
+L(more24bytes):
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $25, %ecx
+ je L(25bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $27, %ecx
+ je L(27bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ cmp $29, %ecx
+ je L(29bytes)
+ cmp $30, %ecx
+ je L(30bytes)
+ jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
+
+ .p2align 4
+L(more32bytes):
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $33, %ecx
+ je L(33bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $35, %ecx
+ je L(35bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ cmp $37, %ecx
+ je L(37bytes)
+ cmp $38, %ecx
+ je L(38bytes)
+ jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
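+
+/* The L(NNbytes) labels below form fall-through ladders: entering at
+   L(44bytes) compares one dword at offset -44 and falls into
+   L(40bytes), and so on down to L(4bytes), so a length of N costs
+   about N/4 dword compares.  EAX and EDX point one past the end of
+   the two buffers here, which is why all displacements are negative;
+   lengths that are not multiples of 4 use the parallel ladders that
+   finish with word and byte compares.  */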
+
+ .p2align 4
+L(more40bytes):
+ cmp $40, %ecx
+ je L(40bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $41, %ecx
+ je L(41bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $43, %ecx
+ je L(43bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ cmp $45, %ecx
+ je L(45bytes)
+ cmp $46, %ecx
+ je L(46bytes)
+ jmp L(47bytes)
+
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ mov -44(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ mov -40(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ mov -36(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ mov -32(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ mov -28(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ mov -24(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ mov -20(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# endif
+
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
+L(45bytes):
+ mov -45(%eax), %ecx
+ mov -45(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(41bytes):
+ mov -41(%eax), %ecx
+ mov -41(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(37bytes):
+ mov -37(%eax), %ecx
+ mov -37(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(33bytes):
+ mov -33(%eax), %ecx
+ mov -33(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(29bytes):
+ mov -29(%eax), %ecx
+ mov -29(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(25bytes):
+ mov -25(%eax), %ecx
+ mov -25(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(21bytes):
+ mov -21(%eax), %ecx
+ mov -21(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(46bytes):
+ mov -46(%eax), %ecx
+ mov -46(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(42bytes):
+ mov -42(%eax), %ecx
+ mov -42(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(38bytes):
+ mov -38(%eax), %ecx
+ mov -38(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(34bytes):
+ mov -34(%eax), %ecx
+ mov -34(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(30bytes):
+ mov -30(%eax), %ecx
+ mov -30(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(26bytes):
+ mov -26(%eax), %ecx
+ mov -26(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(22bytes):
+ mov -22(%eax), %ecx
+ mov -22(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(47bytes):
+ movl -47(%eax), %ecx
+ movl -47(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(43bytes):
+ movl -43(%eax), %ecx
+ movl -43(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(39bytes):
+ movl -39(%eax), %ecx
+ movl -39(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(35bytes):
+ movl -35(%eax), %ecx
+ movl -35(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(31bytes):
+ movl -31(%eax), %ecx
+ movl -31(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%eax), %ecx
+ movl -27(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%eax), %ecx
+ movl -23(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+
+ .p2align 4
+L(find_diff):
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16, %ecx
+ shr $16, %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+
+ .p2align 4
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
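+
+/* L(find_diff) narrows a differing 32-bit word down to its first
+   differing byte: compare the low bytes, then the low words, then
+   shift the high halves down and repeat.  The flags of the last
+   inequal compare survive into L(end), since pop and mov do not
+   modify EFLAGS, and "ja" there turns the unsigned relation into
+   +1 or -1.  A rough C sketch, with c and b standing for the words
+   loaded into ECX and EBX:
+
+     for (int i = 0; i < 4; i++, c >>= 8, b >>= 8)
+       if ((unsigned char) c != (unsigned char) b)
+	 return (unsigned char) c > (unsigned char) b ? 1 : -1;
+*/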
+# else
+
+/* Find the difference for wmemcmp.  */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
new file mode 100644
index 0000000000..1fc5994a17
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S
@@ -0,0 +1,62 @@
+/* Multiple versions of memcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(memcmp)
+ .type memcmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2: ret
+END(memcmp)
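+
+/* The ENTRY above is the IFUNC resolver: the dynamic linker calls it
+   once at relocation time and binds memcmp to the address returned in
+   EAX.  Roughly, in C (a sketch using the macros above):
+
+     func = __memcmp_ia32;
+     if (HAS_CPU_FEATURE (SSSE3))
+       {
+	 func = __memcmp_ssse3;
+	 if (HAS_CPU_FEATURE (SSE4_2))
+	   func = __memcmp_sse4_2;
+       }
+     return func;
+*/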
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcmp_ia32, @function; \
+ .p2align 4; \
+ .globl __memcmp_ia32; \
+ .hidden __memcmp_ia32; \
+ __memcmp_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library since
+   they will be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
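+/* With ENTRY/END (and, for shared builds, libc_hidden_builtin_def)
+   redefined above, the generic memcmp source included below is
+   assembled under the name __memcmp_ia32, providing the baseline
+   implementation that the resolver falls back to.  */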
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_sse2_unaligned
+# define MEMCPY_CHK __memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+
+ .section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+ cmp %edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+ jg L(check_forward)
+
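+/* For memmove the copy direction must make sure overlapping bytes are
+   read before they are overwritten: when the source is below the
+   destination we fall through to the backward path, otherwise
+   L(check_forward) tests whether the regions overlap at all and takes
+   the plain forward path when they do not.  */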
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+ separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmpl $32, %ecx
+ jg L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_backward):
+ cmpl $64, %ecx
+ jg L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_backward):
+ cmpl $128, %ecx
+ jg L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_backward):
+ add %ecx, %eax
+ cmp %edx, %eax
+ movl SRC(%esp), %eax
+ jle L(forward)
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Align the address of the destination.  */
+ movdqu (%eax), %xmm4
+ movdqu 16(%eax), %xmm5
+ movdqu 32(%eax), %xmm6
+ movdqu 48(%eax), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu -16(%eax, %ecx), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ movl %esi, %ecx
+ andl $-16, %ecx
+ leal (%ecx), %ebx
+ subl %edx, %ebx
+ leal (%eax, %ebx), %eax
+ shrl $6, %ebx
+
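+/* After this setup ESI is dst + len (one past the end), ECX is that
+   end rounded down to a 16-byte boundary, and EBX is (ECX - dst) / 64,
+   the number of 64-byte chunks the aligned backward loop will copy.
+   The unaligned head (xmm4-xmm7) and the 16-byte tail saved on the
+   stack are stored once the loop is done.  */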
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG (bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%eax)
+
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movaps %xmm0, -64(%ecx)
+ subl $64, %eax
+ movaps %xmm1, -48(%ecx)
+ movaps %xmm2, -32(%ecx)
+ movaps %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_backward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %cl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %cl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_backward):
+ PUSH (%esi)
+ movl -4(%eax,%ecx), %ebx
+ movl -8(%eax,%ecx), %esi
+ movl %ebx, -4(%edx,%ecx)
+ movl %esi, -8(%edx,%ecx)
+ subl $8, %ecx
+ POP (%esi)
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+/* Backward copy for large lengths.  */
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movntdq %xmm0, -64(%ecx)
+ subl $64, %eax
+ movntdq %xmm1, -48(%ecx)
+ movntdq %xmm2, -32(%ecx)
+ movntdq %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_backward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(check_forward):
+ add %edx, %ecx
+ cmp %eax, %ecx
+ movl LEN(%esp), %ecx
+ jle L(forward)
+
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+ separately.  */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmpl $32, %ecx
+ ja L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_forward):
+ cmpl $64, %ecx
+ ja L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_forward):
+ cmpl $128, %ecx
+ ja L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_forward):
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Align the address of the destination.  */
+ movdqu -16(%eax, %ecx), %xmm4
+ movdqu -32(%eax, %ecx), %xmm5
+ movdqu -48(%eax, %ecx), %xmm6
+ movdqu -64(%eax, %ecx), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu (%eax), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ leal 16(%edx), %ecx
+ andl $-16, %ecx
+ movl %ecx, %ebx
+ subl %edx, %ebx
+ addl %ebx, %eax
+ movl %esi, %ebx
+ subl %ecx, %ebx
+ shrl $6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_forward)
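+
+/* Copies of at least half the shared cache size would evict most of
+   the cache if done with ordinary stores, so they take the movntdq
+   (non-temporal) loop, which bypasses the cache and issues a single
+   sfence afterwards to order the weakly-ordered stores.  */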
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%eax)
+
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqa %xmm0, (%ecx)
+ addl $64, %eax
+ movaps %xmm1, 16(%ecx)
+ movaps %xmm2, 32(%ecx)
+ movaps %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_forward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %cl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_forward):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(mm_return_pop_all):
+ movl %edx, %eax
+ POP (%edi)
+ POP (%esi)
+ RETURN
+
+/* Forward copy for large lengths.  */
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movntdq %xmm0, (%ecx)
+ addl $64, %eax
+ movntdq %xmm1, 16(%ecx)
+ movntdq %xmm2, 32(%ecx)
+ movntdq %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_forward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+# endif
+
+L(forward):
+ cmp $16, %ecx
+ jbe L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+# endif
+ jae L(large_page)
+
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ cmpl $32, %ecx
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 16(%eax), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ cmpl $64, %ecx
+ movdqu %xmm0, 16(%edx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 32(%eax), %xmm0
+ movdqu 48(%eax), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+ cmpl $128, %ecx
+ movdqu %xmm0, 32(%edx)
+ movdqu %xmm1, 48(%edx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ jbe L(return)
+
+/* Now the main loop: we align the address of the destination. */
+ leal 64(%edx), %ebx
+ andl $-64, %ebx
+
+ addl %edx, %ecx
+ andl $-64, %ecx
+
+ subl %edx, %eax
+
+/* We should stop two iterations before the end so that the
+   prefetches do not read past the source buffer.  */
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_just_one_iteration)
+
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_last_two_iterations)
+
+ .p2align 4
+L(main_loop_cache):
+
+ prefetcht0 128(%ebx, %eax)
+
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ lea 64(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ movaps %xmm4, 64(%ebx)
+ movaps %xmm5, 80(%ebx)
+ movaps %xmm6, 96(%ebx)
+ movaps %xmm7, 112(%ebx)
+ jmp L(return)
+
+L(main_loop_just_one_iteration):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ jmp L(return)
+
+L(large_page):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+
+ movdqu 64(%eax), %xmm0
+ movdqu 80(%eax), %xmm1
+ movdqu 96(%eax), %xmm2
+ movdqu 112(%eax), %xmm3
+ movdqu -128(%eax, %ecx), %xmm4
+ movdqu -112(%eax, %ecx), %xmm5
+ movdqu -96(%eax, %ecx), %xmm6
+ movdqu -80(%eax, %ecx), %xmm7
+ movdqu %xmm0, 64(%edx)
+ movdqu %xmm1, 80(%edx)
+ movdqu %xmm2, 96(%edx)
+ movdqu %xmm3, 112(%edx)
+ movdqu %xmm4, -128(%edx, %ecx)
+ movdqu %xmm5, -112(%edx, %ecx)
+ movdqu %xmm6, -96(%edx, %ecx)
+ movdqu %xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non-temporal stores.  We align
+   the address of the destination. */
+ leal 128(%edx), %ebx
+ andl $-128, %ebx
+
+ addl %edx, %ecx
+ andl $-128, %ecx
+
+ subl %edx, %eax
+
+ .p2align 4
+L(main_loop_large_page):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movntdq %xmm0, (%ebx)
+ movntdq %xmm1, 16(%ebx)
+ movntdq %xmm2, 32(%ebx)
+ movntdq %xmm3, 48(%ebx)
+ movntdq %xmm4, 64(%ebx)
+ movntdq %xmm5, 80(%ebx)
+ movntdq %xmm6, 96(%ebx)
+ movntdq %xmm7, 112(%ebx)
+ lea 128(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_large_page)
+ sfence
+ jmp L(return)
+
+L(len_0_16_bytes):
+ testb $24, %cl
+ jne L(len_9_16_bytes)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(len_5_8_bytes)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%eax), %ebx
+ testb $2, %cl
+ movb %bl, (%edx)
+ je L(return)
+ movzwl -2(%eax,%ecx), %ebx
+ movw %bx, -2(%edx,%ecx)
+ jmp L(return)
+
+L(len_9_16_bytes):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(len_5_8_bytes):
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ movl -4(%eax,%ecx), %ebx
+ movl %ebx, -4(%edx,%ecx)
+
+L(return):
+ movl %edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+ RETURN
+
+END (MEMCPY)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 0000000000..687e083147
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1809 @@
+/* memcpy with SSSE3 and REP string.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_rep
+# define MEMCPY_CHK __memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+ addl $(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+#else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
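+
+/* In the SHARED case the jump tables store 32-bit offsets relative to
+   the table itself, keeping the dispatch position-independent.  A
+   sketch of the computation, with ebx standing for EBX:
+
+     ebx = &table;		// SETUP_PIC_REG plus the addl above
+     ebx += table[index];	// relative entry -> absolute target
+     goto *ebx;
+*/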
+
+ .section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+ cmp %eax, %edx
+ jb L(copy_forward)
+ je L(fwd_write_0bytes)
+ cmp $48, %ecx
+ jb L(bk_write_less48bytes)
+ add %ecx, %eax
+ cmp %eax, %edx
+ movl SRC(%esp), %eax
+ jb L(copy_backward)
+
+L(copy_forward):
+#endif
+ cmp $48, %ecx
+ jae L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jb L(bk_write)
+#endif
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+ ALIGN (4)
+/* ECX >= 48. */
+L(48bytesormore):
+ movdqu (%eax), %xmm0
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ PUSH (%esi)
+ cfi_remember_state
+ add $16, %edx
+ movl %edi, %esi
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+ mov %eax, %edi
+ jae L(large_page)
+ and $0xf, %edi
+ jz L(shl_0)
+
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
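+
+/* Here the destination is 16-byte aligned and EDI holds the low four
+   bits of the source address.  Each L(shl_N) case reads aligned
+   16-byte blocks from the source rounded down by N and uses
+   "palignr $N" to splice neighbouring blocks into the bytes that
+   belong at the destination, so every load and store in the copy
+   loop stays aligned.  */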
+
+ ALIGN (4)
+L(shl_0):
+ movdqu %xmm0, (%esi)
+ xor %edi, %edi
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+ mov __x86_data_cache_size_half, %edi
+# endif
+#endif
+ mov %edi, %esi
+ shr $3, %esi
+ sub %esi, %edi
+ cmp %edi, %ecx
+ jae L(shl_0_gobble_mem_start)
+ sub $128, %ecx
+ ALIGN (4)
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_cache_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_0_gobble_mem_start):
+ cmp %al, %dl
+ je L(copy_page_by_rep)
+ sub $128, %ecx
+L(shl_0_gobble_mem_loop):
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ prefetchnta 0x1c0(%edx)
+ prefetchnta 0x280(%edx)
+
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_mem_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_1):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $1, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_1_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_1_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_1_loop)
+
+L(shl_1_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_2):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $2, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_2_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_2_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_2_loop)
+
+L(shl_2_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_3):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $3, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_3_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_3_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_3_loop)
+
+L(shl_3_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 3(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_4):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $4, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_4_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_4_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_4_loop)
+
+L(shl_4_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 4(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_5):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $5, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_5_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_5_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_5_loop)
+
+L(shl_5_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 5(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_6):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $6, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_6_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_6_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_6_loop)
+
+L(shl_6_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 6(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_7):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $7, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_7_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_7_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_7_loop)
+
+L(shl_7_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 7(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_8):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $8, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_8_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_8_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_8_loop)
+
+L(shl_8_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 8(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_9):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $9, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_9_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_9_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_9_loop)
+
+L(shl_9_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_10):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $10, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_10_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_10_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_10_loop)
+
+L(shl_10_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 10(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_11):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $11, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_11_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_11_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_11_loop)
+
+L(shl_11_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 11(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_12):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $12, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_12_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_12_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_12_loop)
+
+L(shl_12_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 12(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_13):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $13, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_13_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_13_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_13_loop)
+
+L(shl_13_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 13(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_14):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $14, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_14_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_14_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_14_loop)
+
+L(shl_14_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 14(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(shl_15):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ sub $15, %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ sub $32, %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_15_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(shl_15_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_15_loop)
+
+L(shl_15_end):
+ add $32, %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 15(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(fwd_write_44bytes):
+ movl -44(%eax), %ecx
+ movl %ecx, -44(%edx)
+L(fwd_write_40bytes):
+ movl -40(%eax), %ecx
+ movl %ecx, -40(%edx)
+L(fwd_write_36bytes):
+ movl -36(%eax), %ecx
+ movl %ecx, -36(%edx)
+L(fwd_write_32bytes):
+ movl -32(%eax), %ecx
+ movl %ecx, -32(%edx)
+L(fwd_write_28bytes):
+ movl -28(%eax), %ecx
+ movl %ecx, -28(%edx)
+L(fwd_write_24bytes):
+ movl -24(%eax), %ecx
+ movl %ecx, -24(%edx)
+L(fwd_write_20bytes):
+ movl -20(%eax), %ecx
+ movl %ecx, -20(%edx)
+L(fwd_write_16bytes):
+ movl -16(%eax), %ecx
+ movl %ecx, -16(%edx)
+L(fwd_write_12bytes):
+ movl -12(%eax), %ecx
+ movl %ecx, -12(%edx)
+L(fwd_write_8bytes):
+ movl -8(%eax), %ecx
+ movl %ecx, -8(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_5bytes):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_45bytes):
+ movl -45(%eax), %ecx
+ movl %ecx, -45(%edx)
+L(fwd_write_41bytes):
+ movl -41(%eax), %ecx
+ movl %ecx, -41(%edx)
+L(fwd_write_37bytes):
+ movl -37(%eax), %ecx
+ movl %ecx, -37(%edx)
+L(fwd_write_33bytes):
+ movl -33(%eax), %ecx
+ movl %ecx, -33(%edx)
+L(fwd_write_29bytes):
+ movl -29(%eax), %ecx
+ movl %ecx, -29(%edx)
+L(fwd_write_25bytes):
+ movl -25(%eax), %ecx
+ movl %ecx, -25(%edx)
+L(fwd_write_21bytes):
+ movl -21(%eax), %ecx
+ movl %ecx, -21(%edx)
+L(fwd_write_17bytes):
+ movl -17(%eax), %ecx
+ movl %ecx, -17(%edx)
+L(fwd_write_13bytes):
+ movl -13(%eax), %ecx
+ movl %ecx, -13(%edx)
+L(fwd_write_9bytes):
+ movl -9(%eax), %ecx
+ movl %ecx, -9(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_46bytes):
+ movl -46(%eax), %ecx
+ movl %ecx, -46(%edx)
+L(fwd_write_42bytes):
+ movl -42(%eax), %ecx
+ movl %ecx, -42(%edx)
+L(fwd_write_38bytes):
+ movl -38(%eax), %ecx
+ movl %ecx, -38(%edx)
+L(fwd_write_34bytes):
+ movl -34(%eax), %ecx
+ movl %ecx, -34(%edx)
+L(fwd_write_30bytes):
+ movl -30(%eax), %ecx
+ movl %ecx, -30(%edx)
+L(fwd_write_26bytes):
+ movl -26(%eax), %ecx
+ movl %ecx, -26(%edx)
+L(fwd_write_22bytes):
+ movl -22(%eax), %ecx
+ movl %ecx, -22(%edx)
+L(fwd_write_18bytes):
+ movl -18(%eax), %ecx
+ movl %ecx, -18(%edx)
+L(fwd_write_14bytes):
+ movl -14(%eax), %ecx
+ movl %ecx, -14(%edx)
+L(fwd_write_10bytes):
+ movl -10(%eax), %ecx
+ movl %ecx, -10(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_47bytes):
+ movl -47(%eax), %ecx
+ movl %ecx, -47(%edx)
+L(fwd_write_43bytes):
+ movl -43(%eax), %ecx
+ movl %ecx, -43(%edx)
+L(fwd_write_39bytes):
+ movl -39(%eax), %ecx
+ movl %ecx, -39(%edx)
+L(fwd_write_35bytes):
+ movl -35(%eax), %ecx
+ movl %ecx, -35(%edx)
+L(fwd_write_31bytes):
+ movl -31(%eax), %ecx
+ movl %ecx, -31(%edx)
+L(fwd_write_27bytes):
+ movl -27(%eax), %ecx
+ movl %ecx, -27(%edx)
+L(fwd_write_23bytes):
+ movl -23(%eax), %ecx
+ movl %ecx, -23(%edx)
+L(fwd_write_19bytes):
+ movl -19(%eax), %ecx
+ movl %ecx, -19(%edx)
+L(fwd_write_15bytes):
+ movl -15(%eax), %ecx
+ movl %ecx, -15(%edx)
+L(fwd_write_11bytes):
+ movl -11(%eax), %ecx
+ movl %ecx, -11(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN_END
+
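+/* Large copies (roughly those of at least half the shared cache size)
+   are streamed through non-temporal movntdq stores so the copied data
+   does not pollute the cache.  If, after the 16-byte head copy, the
+   low bytes of SRC and DEST match -- i.e. the pointers share the same
+   alignment modulo 256 -- the copy is done with rep movsl instead
+   (see L(copy_page_by_rep) below).  */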
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(large_page):
+ movdqu (%eax), %xmm1
+ movdqu %xmm0, (%esi)
+ movntdq %xmm1, (%edx)
+ add $0x10, %eax
+ add $0x10, %edx
+ sub $0x10, %ecx
+ cmp %al, %dl
+ je L(copy_page_by_rep)
+L(large_page_loop_init):
+ POP (%esi)
+ sub $0x80, %ecx
+ POP (%edi)
+L(large_page_loop):
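+	/* 128 bytes per iteration: eight unaligned loads, then eight
+	   non-temporal stores; the lfence in between presumably keeps
+	   the loads ordered ahead of the weakly ordered movntdq
+	   stores.  */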
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ lfence
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop)
+ add $0x80, %ecx
+ cmp $0x40, %ecx
+ jb L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jb L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ sfence
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
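+/* SRC and DEST share the same alignment modulo 256, so plain string
+   moves are expected to be fast here: copy ECX / 4 dwords with rep
+   movsl, then finish the remaining 0-3 bytes by hand.  */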
+ cfi_restore_state
+ cfi_remember_state
+ ALIGN (4)
+L(copy_page_by_rep):
+ mov %eax, %esi
+ mov %edx, %edi
+ mov %ecx, %edx
+ shr $2, %ecx
+ and $3, %edx
+ rep movsl
+ jz L(copy_page_by_rep_exit)
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1)
+ movzwl (%esi), %eax
+ movw %ax, (%edi)
+ add $2, %esi
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+ movzbl (%esi), %eax
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%esi)
+ POP (%edi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_44bytes):
+ movl 40(%eax), %ecx
+ movl %ecx, 40(%edx)
+L(bk_write_40bytes):
+ movl 36(%eax), %ecx
+ movl %ecx, 36(%edx)
+L(bk_write_36bytes):
+ movl 32(%eax), %ecx
+ movl %ecx, 32(%edx)
+L(bk_write_32bytes):
+ movl 28(%eax), %ecx
+ movl %ecx, 28(%edx)
+L(bk_write_28bytes):
+ movl 24(%eax), %ecx
+ movl %ecx, 24(%edx)
+L(bk_write_24bytes):
+ movl 20(%eax), %ecx
+ movl %ecx, 20(%edx)
+L(bk_write_20bytes):
+ movl 16(%eax), %ecx
+ movl %ecx, 16(%edx)
+L(bk_write_16bytes):
+ movl 12(%eax), %ecx
+ movl %ecx, 12(%edx)
+L(bk_write_12bytes):
+ movl 8(%eax), %ecx
+ movl %ecx, 8(%edx)
+L(bk_write_8bytes):
+ movl 4(%eax), %ecx
+ movl %ecx, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_45bytes):
+ movl 41(%eax), %ecx
+ movl %ecx, 41(%edx)
+L(bk_write_41bytes):
+ movl 37(%eax), %ecx
+ movl %ecx, 37(%edx)
+L(bk_write_37bytes):
+ movl 33(%eax), %ecx
+ movl %ecx, 33(%edx)
+L(bk_write_33bytes):
+ movl 29(%eax), %ecx
+ movl %ecx, 29(%edx)
+L(bk_write_29bytes):
+ movl 25(%eax), %ecx
+ movl %ecx, 25(%edx)
+L(bk_write_25bytes):
+ movl 21(%eax), %ecx
+ movl %ecx, 21(%edx)
+L(bk_write_21bytes):
+ movl 17(%eax), %ecx
+ movl %ecx, 17(%edx)
+L(bk_write_17bytes):
+ movl 13(%eax), %ecx
+ movl %ecx, 13(%edx)
+L(bk_write_13bytes):
+ movl 9(%eax), %ecx
+ movl %ecx, 9(%edx)
+L(bk_write_9bytes):
+ movl 5(%eax), %ecx
+ movl %ecx, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_46bytes):
+ movl 42(%eax), %ecx
+ movl %ecx, 42(%edx)
+L(bk_write_42bytes):
+ movl 38(%eax), %ecx
+ movl %ecx, 38(%edx)
+L(bk_write_38bytes):
+ movl 34(%eax), %ecx
+ movl %ecx, 34(%edx)
+L(bk_write_34bytes):
+ movl 30(%eax), %ecx
+ movl %ecx, 30(%edx)
+L(bk_write_30bytes):
+ movl 26(%eax), %ecx
+ movl %ecx, 26(%edx)
+L(bk_write_26bytes):
+ movl 22(%eax), %ecx
+ movl %ecx, 22(%edx)
+L(bk_write_22bytes):
+ movl 18(%eax), %ecx
+ movl %ecx, 18(%edx)
+L(bk_write_18bytes):
+ movl 14(%eax), %ecx
+ movl %ecx, 14(%edx)
+L(bk_write_14bytes):
+ movl 10(%eax), %ecx
+ movl %ecx, 10(%edx)
+L(bk_write_10bytes):
+ movl 6(%eax), %ecx
+ movl %ecx, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_47bytes):
+ movl 43(%eax), %ecx
+ movl %ecx, 43(%edx)
+L(bk_write_43bytes):
+ movl 39(%eax), %ecx
+ movl %ecx, 39(%edx)
+L(bk_write_39bytes):
+ movl 35(%eax), %ecx
+ movl %ecx, 35(%edx)
+L(bk_write_35bytes):
+ movl 31(%eax), %ecx
+ movl %ecx, 31(%edx)
+L(bk_write_31bytes):
+ movl 27(%eax), %ecx
+ movl %ecx, 27(%edx)
+L(bk_write_27bytes):
+ movl 23(%eax), %ecx
+ movl %ecx, 23(%edx)
+L(bk_write_23bytes):
+ movl 19(%eax), %ecx
+ movl %ecx, 19(%edx)
+L(bk_write_19bytes):
+ movl 15(%eax), %ecx
+ movl %ecx, 15(%edx)
+L(bk_write_15bytes):
+ movl 11(%eax), %ecx
+ movl %ecx, 11(%edx)
+L(bk_write_11bytes):
+ movl 7(%eax), %ecx
+ movl %ecx, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN_END
+
+
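+/* Tail-copy jump tables: entry N of each table copies the final N
+   bytes (0..47).  In shared builds the entries are table-relative
+   offsets (see the JMPTBL macro), so the tables need no load-time
+   relocation.  */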
+ .pushsection .rodata.ssse3,"a",@progbits
+ ALIGN (2)
+L(table_48bytes_fwd):
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
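+/* Dispatch table for the palignr paths: entry N handles a source that
+   is N bytes past a 16-byte boundary once the destination has been
+   aligned.  */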
+ ALIGN (2)
+L(shl_table):
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ ALIGN (2)
+L(table_48_bytes_bwd):
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+#ifdef USE_AS_MEMMOVE
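+/* Backward copy for overlapping memmove: point ESI and EDX one past
+   the end of source and destination, align the destination end first
+   to 4 and then to 16 bytes, and copy the bulk in 64-byte chunks from
+   back to front.  */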
+ ALIGN (4)
+L(copy_backward):
+ PUSH (%esi)
+ movl %eax, %esi
+ add %ecx, %edx
+ add %ecx, %esi
+ testl $0x3, %edx
+ jnz L(bk_align)
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jae L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jb L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+ /* Copy 32 bytes at a time. */
+ sub $32, %ecx
+ movl -4(%esi), %eax
+ movl %eax, -4(%edx)
+ movl -8(%esi), %eax
+ movl %eax, -8(%edx)
+ movl -12(%esi), %eax
+ movl %eax, -12(%edx)
+ movl -16(%esi), %eax
+ movl %eax, -16(%edx)
+ movl -20(%esi), %eax
+ movl %eax, -20(%edx)
+ movl -24(%esi), %eax
+ movl %eax, -24(%edx)
+ movl -28(%esi), %eax
+ movl %eax, -28(%edx)
+ movl -32(%esi), %eax
+ movl %eax, -32(%edx)
+ sub $32, %edx
+ sub $32, %esi
+
+L(bk_write_less32bytes):
+ movl %esi, %eax
+ sub %ecx, %edx
+ sub %ecx, %eax
+ POP (%esi)
+L(bk_write_less48bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+ CFI_PUSH (%esi)
+ ALIGN (4)
+L(bk_align):
+ cmp $8, %ecx
+ jbe L(bk_write_less32bytes)
+ testl $1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+ jz L(bk_got2)
+ sub $1, %esi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%esi), %eax
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %esi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%esi), %eax
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ ALIGN (4)
+L(bk_write_more64bytes):
+	/* Check whether the destination end is 16-byte aligned.  */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned to 4 bytes, but not to 16 bytes.  */
+L(bk_ssse3_align):
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jb L(bk_write_more32bytes)
+
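+	/* Main backward loop: step back 64 bytes per iteration, using
+	   unaligned loads and aligned stores (the destination end was
+	   aligned to 16 bytes above).  */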
+L(bk_ssse3_cpy):
+ sub $64, %esi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%esi), %xmm3
+ movdqa %xmm3, 0x30(%edx)
+ movdqu 0x20(%esi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%esi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%esi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jae L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000000..53e8a6ca1d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3162 @@
+/* memcpy with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+#  define PARMS		8	/* Return address plus the EBX pushed by ENTRANCE.  */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx, INDEX, SCALE), %ebx; \
+    /* We loaded the jump table entry.  Go.  */			\
+ jmp *%ebx
+# else
+
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(, INDEX, SCALE)
+# endif
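+
+/* In C terms the two dispatch flavors are roughly (illustration
+   only):
+
+     non-shared:  goto *table[index];
+     shared:      goto *((char *) table + table[index]);
+
+   PIC builds store self-relative offsets so the tables contain no
+   absolute addresses that would need relocation.  */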
+
+ .section .text.ssse3,"ax",@progbits
+# if !defined USE_AS_BCOPY
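+/* __memcpy_chk (DEST, SRC, LEN, DESTLEN): jump to __chk_fail when
+   DESTLEN < LEN, otherwise fall through into memcpy.  */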
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
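+/* For memmove, pick a copy direction: forward when DEST < SRC or the
+   regions are disjoint, backward when DEST lies inside [SRC, SRC+LEN);
+   overlapping copies of fewer than 32 bytes take the short
+   L(bk_write_less32bytes_2) path.  */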
+# ifdef USE_AS_MEMMOVE
+ cmp %eax, %edx
+ jb L(copy_forward)
+ je L(fwd_write_0bytes)
+ cmp $32, %ecx
+ jae L(memmove_bwd)
+ jmp L(bk_write_less32bytes_2)
+
+ .p2align 4
+L(memmove_bwd):
+ add %ecx, %eax
+ cmp %eax, %edx
+ movl SRC(%esp), %eax
+ jb L(copy_backward)
+
+L(copy_forward):
+# endif
+ cmp $48, %ecx
+ jae L(48bytesormore)
+
+L(fwd_write_less32bytes):
+# ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jb L(bk_write)
+# endif
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+# ifndef USE_AS_MEMMOVE
+ .p2align 4
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+# endif
+
+ .p2align 4
+L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+ movlpd (%eax), %xmm0
+ movlpd 8(%eax), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+# else
+ movdqu (%eax), %xmm0
+# endif
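+	/* Round DEST up to the next 16-byte boundary and adjust SRC and
+	   LEN to match; the first 16 bytes are handled separately
+	   (copied above, or stashed in xmm0 for the memmove paths), so
+	   the main loops can use aligned stores.  */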
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+# endif
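+
+	/* Copies of at least half the shared cache size take the
+	   non-temporal L(large_page) path.  */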
+
+ mov %eax, %edi
+ jae L(large_page)
+ and $0xf, %edi
+ jz L(shl_0)
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
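+/* EDI = SRC & 15 selects a shift path: L(shl_N) reads 16-byte-aligned
+   chunks starting at SRC - N and stitches them together with
+   palignr $N, so every vector load and store is aligned.  */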
+ .p2align 4
+L(shl_0):
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
+ xor %edi, %edi
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+
+ .p2align 4
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jb L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ POP (%edi)
+ lea -128(%ecx), %ecx
+ jae L(shl_0_gobble_mem_loop)
+
+ .p2align 4
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_cache_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ .p2align 4
+L(shl_0_gobble_mem_loop):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x280(%eax)
+ prefetcht0 0x1c0(%edx)
+
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jae L(shl_0_gobble_mem_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jb L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jb L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+ .p2align 4
+L(shl_1):
+# ifndef USE_AS_MEMMOVE
+ movaps -1(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -1(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_1_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
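+	/* 64 bytes per iteration: four movaps loads from 16-byte-aligned
+	   addresses (SRC is 1 past a boundary), realigned in pairs with
+	   palignr $1 so all stores are aligned and contiguous; xmm1
+	   carries the trailing chunk into the next iteration.  */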
+ .p2align 4
+L(Shl1LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ movaps 47(%eax), %xmm4
+ movaps 63(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $1, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_1_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -1(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_1_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_1_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+ movaps -2(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -2(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_2_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl2LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ movaps 46(%eax), %xmm4
+ movaps 62(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $2, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_2_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -2(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_2_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_2_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
+ movaps -3(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -3(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_3_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl3LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ movaps 45(%eax), %xmm4
+ movaps 61(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $3, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_3_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -3(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_3_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_3_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_3_no_prefetch_loop)
+
+L(sh_3_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 3(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_4):
+# ifndef USE_AS_MEMMOVE
+ movaps -4(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -4(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_4_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl4LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ movaps 44(%eax), %xmm4
+ movaps 60(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $4, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_4_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -4(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_4_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_4_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_4_no_prefetch_loop)
+
+L(sh_4_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 4(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_5):
+# ifndef USE_AS_MEMMOVE
+ movaps -5(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -5(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_5_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl5LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ movaps 43(%eax), %xmm4
+ movaps 59(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ palignr $5, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $5, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_5_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -5(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_5_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_5_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_5_no_prefetch_loop)
+
+L(sh_5_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 5(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_6):
+# ifndef USE_AS_MEMMOVE
+ movaps -6(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -6(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_6_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl6LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ movaps 42(%eax), %xmm4
+ movaps 58(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ palignr $6, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $6, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_6_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -6(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_6_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jb L(sh_6_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(sh_6_no_prefetch_loop)
+
+L(sh_6_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 6(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_7):
+# ifndef USE_AS_MEMMOVE
+ movaps -7(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -7(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_7_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl7LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ movaps 41(%eax), %xmm4
+ movaps 57(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ palignr $7, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $7, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_7_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -7(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_7_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_7_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_7_no_prefetch_loop)
+
+L(sh_7_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 7(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_8):
+# ifndef USE_AS_MEMMOVE
+ movaps -8(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -8(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_8_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl8LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ movaps 40(%eax), %xmm4
+ movaps 56(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $8, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl8LoopStart)
+
+L(LoopLeave8):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_8_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -8(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_8_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_8_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_8_no_prefetch_loop)
+
+L(sh_8_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 8(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_9):
+# ifndef USE_AS_MEMMOVE
+ movaps -9(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -9(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_9_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl9LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ movaps 39(%eax), %xmm4
+ movaps 55(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ palignr $9, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $9, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_9_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -9(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_9_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_9_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+ movaps -10(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -10(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_10_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl10LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ movaps 38(%eax), %xmm4
+ movaps 54(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ palignr $10, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $10, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_10_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -10(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_10_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_10_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_10_no_prefetch_loop)
+
+L(sh_10_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 10(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_11):
+# ifndef USE_AS_MEMMOVE
+ movaps -11(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -11(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_11_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl11LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ movaps 37(%eax), %xmm4
+ movaps 53(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ palignr $11, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $11, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_11_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -11(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_11_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_11_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_11_no_prefetch_loop)
+
+L(sh_11_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 11(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_12):
+# ifndef USE_AS_MEMMOVE
+ movaps -12(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -12(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_12_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl12LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ movaps 36(%eax), %xmm4
+ movaps 52(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ palignr $12, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $12, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_12_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -12(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_12_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_12_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_12_no_prefetch_loop)
+
+L(sh_12_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 12(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_13):
+# ifndef USE_AS_MEMMOVE
+ movaps -13(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -13(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_13_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl13LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ movaps 35(%eax), %xmm4
+ movaps 51(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ palignr $13, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $13, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_13_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -13(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_13_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_13_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_13_no_prefetch_loop)
+
+L(sh_13_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 13(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_14):
+# ifndef USE_AS_MEMMOVE
+ movaps -14(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -14(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_14_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl14LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ movaps 34(%eax), %xmm4
+ movaps 50(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ palignr $14, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $14, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_14_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -14(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_14_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_14_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_14_no_prefetch_loop)
+
+L(sh_14_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 14(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_15):
+# ifndef USE_AS_MEMMOVE
+ movaps -15(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -15(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_15_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl15LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ movaps 33(%eax), %xmm4
+ movaps 49(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ palignr $15, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $15, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_15_no_prefetch):
+ lea -32(%ecx), %ecx
+ lea -15(%eax), %eax
+ xor %edi, %edi
+
+ .p2align 4
+L(sh_15_no_prefetch_loop):
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jb L(sh_15_end_no_prefetch_loop)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_15_no_prefetch_loop)
+
+L(sh_15_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 15(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_end_0):
+ lea 32(%ecx), %ecx
+ lea (%edx, %ecx), %edx
+ lea (%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ .p2align 4
+L(fwd_write_44bytes):
+ movq -44(%eax), %xmm0
+ movq %xmm0, -44(%edx)
+L(fwd_write_36bytes):
+ movq -36(%eax), %xmm0
+ movq %xmm0, -36(%edx)
+L(fwd_write_28bytes):
+ movq -28(%eax), %xmm0
+ movq %xmm0, -28(%edx)
+L(fwd_write_20bytes):
+ movq -20(%eax), %xmm0
+ movq %xmm0, -20(%edx)
+L(fwd_write_12bytes):
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_40bytes):
+ movq -40(%eax), %xmm0
+ movq %xmm0, -40(%edx)
+L(fwd_write_32bytes):
+ movq -32(%eax), %xmm0
+ movq %xmm0, -32(%edx)
+L(fwd_write_24bytes):
+ movq -24(%eax), %xmm0
+ movq %xmm0, -24(%edx)
+L(fwd_write_16bytes):
+ movq -16(%eax), %xmm0
+ movq %xmm0, -16(%edx)
+L(fwd_write_8bytes):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes):
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_5bytes):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_45bytes):
+ movq -45(%eax), %xmm0
+ movq %xmm0, -45(%edx)
+L(fwd_write_37bytes):
+ movq -37(%eax), %xmm0
+ movq %xmm0, -37(%edx)
+L(fwd_write_29bytes):
+ movq -29(%eax), %xmm0
+ movq %xmm0, -29(%edx)
+L(fwd_write_21bytes):
+ movq -21(%eax), %xmm0
+ movq %xmm0, -21(%edx)
+L(fwd_write_13bytes):
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes):
+ movq -41(%eax), %xmm0
+ movq %xmm0, -41(%edx)
+L(fwd_write_33bytes):
+ movq -33(%eax), %xmm0
+ movq %xmm0, -33(%edx)
+L(fwd_write_25bytes):
+ movq -25(%eax), %xmm0
+ movq %xmm0, -25(%edx)
+L(fwd_write_17bytes):
+ movq -17(%eax), %xmm0
+ movq %xmm0, -17(%edx)
+L(fwd_write_9bytes):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_46bytes):
+ movq -46(%eax), %xmm0
+ movq %xmm0, -46(%edx)
+L(fwd_write_38bytes):
+ movq -38(%eax), %xmm0
+ movq %xmm0, -38(%edx)
+L(fwd_write_30bytes):
+ movq -30(%eax), %xmm0
+ movq %xmm0, -30(%edx)
+L(fwd_write_22bytes):
+ movq -22(%eax), %xmm0
+ movq %xmm0, -22(%edx)
+L(fwd_write_14bytes):
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes):
+ movq -42(%eax), %xmm0
+ movq %xmm0, -42(%edx)
+L(fwd_write_34bytes):
+ movq -34(%eax), %xmm0
+ movq %xmm0, -34(%edx)
+L(fwd_write_26bytes):
+ movq -26(%eax), %xmm0
+ movq %xmm0, -26(%edx)
+L(fwd_write_18bytes):
+ movq -18(%eax), %xmm0
+ movq %xmm0, -18(%edx)
+L(fwd_write_10bytes):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_47bytes):
+ movq -47(%eax), %xmm0
+ movq %xmm0, -47(%edx)
+L(fwd_write_39bytes):
+ movq -39(%eax), %xmm0
+ movq %xmm0, -39(%edx)
+L(fwd_write_31bytes):
+ movq -31(%eax), %xmm0
+ movq %xmm0, -31(%edx)
+L(fwd_write_23bytes):
+ movq -23(%eax), %xmm0
+ movq %xmm0, -23(%edx)
+L(fwd_write_15bytes):
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes):
+ movq -43(%eax), %xmm0
+ movq %xmm0, -43(%edx)
+L(fwd_write_35bytes):
+ movq -35(%eax), %xmm0
+ movq %xmm0, -35(%edx)
+L(fwd_write_27bytes):
+ movq -27(%eax), %xmm0
+ movq %xmm0, -27(%edx)
+L(fwd_write_19bytes):
+ movq -19(%eax), %xmm0
+ movq %xmm0, -19(%edx)
+L(fwd_write_11bytes):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_40bytes_align):
+ movdqa -40(%eax), %xmm0
+ movdqa %xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+ movdqa -24(%eax), %xmm0
+ movdqa %xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_32bytes_align):
+ movdqa -32(%eax), %xmm0
+ movdqa %xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+ movdqa -16(%eax), %xmm0
+ movdqa %xmm0, -16(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_5bytes_align):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_45bytes_align):
+ movdqa -45(%eax), %xmm0
+ movdqa %xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+ movdqa -29(%eax), %xmm0
+ movdqa %xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_37bytes_align):
+ movdqa -37(%eax), %xmm0
+ movdqa %xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+ movdqa -21(%eax), %xmm0
+ movdqa %xmm0, -21(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_41bytes_align):
+ movdqa -41(%eax), %xmm0
+ movdqa %xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+ movdqa -25(%eax), %xmm0
+ movdqa %xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_33bytes_align):
+ movdqa -33(%eax), %xmm0
+ movdqa %xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+ movdqa -17(%eax), %xmm0
+ movdqa %xmm0, -17(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_46bytes_align):
+ movdqa -46(%eax), %xmm0
+ movdqa %xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+ movdqa -30(%eax), %xmm0
+ movdqa %xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_38bytes_align):
+ movdqa -38(%eax), %xmm0
+ movdqa %xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+ movdqa -22(%eax), %xmm0
+ movdqa %xmm0, -22(%edx)
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_42bytes_align):
+ movdqa -42(%eax), %xmm0
+ movdqa %xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+ movdqa -26(%eax), %xmm0
+ movdqa %xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_34bytes_align):
+ movdqa -34(%eax), %xmm0
+ movdqa %xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+ movdqa -18(%eax), %xmm0
+ movdqa %xmm0, -18(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_47bytes_align):
+ movdqa -47(%eax), %xmm0
+ movdqa %xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+ movdqa -31(%eax), %xmm0
+ movdqa %xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_39bytes_align):
+ movdqa -39(%eax), %xmm0
+ movdqa %xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+ movdqa -23(%eax), %xmm0
+ movdqa %xmm0, -23(%edx)
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_43bytes_align):
+ movdqa -43(%eax), %xmm0
+ movdqa %xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+ movdqa -27(%eax), %xmm0
+ movdqa %xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_35bytes_align):
+ movdqa -35(%eax), %xmm0
+ movdqa %xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+ movdqa -19(%eax), %xmm0
+ movdqa %xmm0, -19(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_44bytes_align):
+ movdqa -44(%eax), %xmm0
+ movdqa %xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+ movdqa -28(%eax), %xmm0
+ movdqa %xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(fwd_write_36bytes_align):
+ movdqa -36(%eax), %xmm0
+ movdqa %xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+ movdqa -20(%eax), %xmm0
+ movdqa %xmm0, -20(%edx)
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+# endif
+ RETURN_END
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(large_page):
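+	/* The copy is past the cache-size threshold: stream the data with
+	   non-temporal movntdq stores; the sfence before the tail dispatch
+	   makes those stores globally visible in order.  */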
+ movdqu (%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
+ lea 16(%eax), %eax
+ movntdq %xmm1, (%edx)
+ lea 16(%edx), %edx
+ lea -0x90(%ecx), %ecx
+ POP (%edi)
+
+ .p2align 4
+L(large_page_loop):
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jb L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ sfence
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+ .p2align 4
+L(bk_write_44bytes):
+ movq 36(%eax), %xmm0
+ movq %xmm0, 36(%edx)
+L(bk_write_36bytes):
+ movq 28(%eax), %xmm0
+ movq %xmm0, 28(%edx)
+L(bk_write_28bytes):
+ movq 20(%eax), %xmm0
+ movq %xmm0, 20(%edx)
+L(bk_write_20bytes):
+ movq 12(%eax), %xmm0
+ movq %xmm0, 12(%edx)
+L(bk_write_12bytes):
+ movq 4(%eax), %xmm0
+ movq %xmm0, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_40bytes):
+ movq 32(%eax), %xmm0
+ movq %xmm0, 32(%edx)
+L(bk_write_32bytes):
+ movq 24(%eax), %xmm0
+ movq %xmm0, 24(%edx)
+L(bk_write_24bytes):
+ movq 16(%eax), %xmm0
+ movq %xmm0, 16(%edx)
+L(bk_write_16bytes):
+ movq 8(%eax), %xmm0
+ movq %xmm0, 8(%edx)
+L(bk_write_8bytes):
+ movq (%eax), %xmm0
+ movq %xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_45bytes):
+ movq 37(%eax), %xmm0
+ movq %xmm0, 37(%edx)
+L(bk_write_37bytes):
+ movq 29(%eax), %xmm0
+ movq %xmm0, 29(%edx)
+L(bk_write_29bytes):
+ movq 21(%eax), %xmm0
+ movq %xmm0, 21(%edx)
+L(bk_write_21bytes):
+ movq 13(%eax), %xmm0
+ movq %xmm0, 13(%edx)
+L(bk_write_13bytes):
+ movq 5(%eax), %xmm0
+ movq %xmm0, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_41bytes):
+ movq 33(%eax), %xmm0
+ movq %xmm0, 33(%edx)
+L(bk_write_33bytes):
+ movq 25(%eax), %xmm0
+ movq %xmm0, 25(%edx)
+L(bk_write_25bytes):
+ movq 17(%eax), %xmm0
+ movq %xmm0, 17(%edx)
+L(bk_write_17bytes):
+ movq 9(%eax), %xmm0
+ movq %xmm0, 9(%edx)
+L(bk_write_9bytes):
+ movq 1(%eax), %xmm0
+ movq %xmm0, 1(%edx)
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_46bytes):
+ movq 38(%eax), %xmm0
+ movq %xmm0, 38(%edx)
+L(bk_write_38bytes):
+ movq 30(%eax), %xmm0
+ movq %xmm0, 30(%edx)
+L(bk_write_30bytes):
+ movq 22(%eax), %xmm0
+ movq %xmm0, 22(%edx)
+L(bk_write_22bytes):
+ movq 14(%eax), %xmm0
+ movq %xmm0, 14(%edx)
+L(bk_write_14bytes):
+ movq 6(%eax), %xmm0
+ movq %xmm0, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_42bytes):
+ movq 34(%eax), %xmm0
+ movq %xmm0, 34(%edx)
+L(bk_write_34bytes):
+ movq 26(%eax), %xmm0
+ movq %xmm0, 26(%edx)
+L(bk_write_26bytes):
+ movq 18(%eax), %xmm0
+ movq %xmm0, 18(%edx)
+L(bk_write_18bytes):
+ movq 10(%eax), %xmm0
+ movq %xmm0, 10(%edx)
+L(bk_write_10bytes):
+ movq 2(%eax), %xmm0
+ movq %xmm0, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_47bytes):
+ movq 39(%eax), %xmm0
+ movq %xmm0, 39(%edx)
+L(bk_write_39bytes):
+ movq 31(%eax), %xmm0
+ movq %xmm0, 31(%edx)
+L(bk_write_31bytes):
+ movq 23(%eax), %xmm0
+ movq %xmm0, 23(%edx)
+L(bk_write_23bytes):
+ movq 15(%eax), %xmm0
+ movq %xmm0, 15(%edx)
+L(bk_write_15bytes):
+ movq 7(%eax), %xmm0
+ movq %xmm0, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(bk_write_43bytes):
+ movq 35(%eax), %xmm0
+ movq %xmm0, 35(%edx)
+L(bk_write_35bytes):
+ movq 27(%eax), %xmm0
+ movq %xmm0, 27(%edx)
+L(bk_write_27bytes):
+ movq 19(%eax), %xmm0
+ movq %xmm0, 19(%edx)
+L(bk_write_19bytes):
+ movq 11(%eax), %xmm0
+ movq %xmm0, 11(%edx)
+L(bk_write_11bytes):
+ movq 3(%eax), %xmm0
+ movq %xmm0, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+# ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+# endif
+ RETURN_END
+
+
+ .pushsection .rodata.ssse3,"a",@progbits
+ .p2align 2
+L(table_48bytes_fwd):
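+	/* Entry N copies the last N bytes (0..47); on entry EAX and EDX
+	   point just past the end of source and destination.  */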
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+ .p2align 2
+L(table_48bytes_fwd_align):
+ .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+ .p2align 2
+L(shl_table):
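+	/* Entry N dispatches to the palignr $N loop above, which handles
+	   a source that is N bytes away from 16-byte alignment.  */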
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ .p2align 2
+L(table_48_bytes_bwd):
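+	/* Entry N copies the first N bytes (0..47); on entry EAX and EDX
+	   point at the start of source and destination.  */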
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+# ifdef USE_AS_MEMMOVE
+ .p2align 4
+L(copy_backward):
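+	/* The regions overlap with DEST above SRC: move both pointers to
+	   the end and copy downwards so that source bytes are read before
+	   they can be overwritten.  */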
+ PUSH (%edi)
+ movl %eax, %edi
+ lea (%ecx,%edx,1),%edx
+ lea (%ecx,%edi,1),%edi
+ testl $0x3, %edx
+ jnz L(bk_align)
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jae L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jb L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy the trailing 32 bytes of the remainder. */
+ sub $32, %ecx
+ movq -8(%edi), %xmm0
+ movq %xmm0, -8(%edx)
+ movq -16(%edi), %xmm0
+ movq %xmm0, -16(%edx)
+ movq -24(%edi), %xmm0
+ movq %xmm0, -24(%edx)
+ movq -32(%edi), %xmm0
+ movq %xmm0, -32(%edx)
+ sub $32, %edx
+ sub $32, %edi
+
+L(bk_write_less32bytes):
+ movl %edi, %eax
+ sub %ecx, %edx
+ sub %ecx, %eax
+ POP (%edi)
+L(bk_write_less32bytes_2):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(bk_align):
+ cmp $8, %ecx
+ jbe L(bk_write_less32bytes)
+ testl $1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
+	   then (EDX & 2) must be != 0.  */
+ jz L(bk_got2)
+ sub $1, %edi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%edi), %eax
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %edi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%edi), %eax
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ .p2align 4
+L(bk_write_more64bytes):
+	/* Check 16-byte alignment of the end address.  */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned, but not 16-byte aligned.  */
+L(bk_ssse3_align):
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %edi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%edi), %eax
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jb L(bk_write_more32bytes)
+
+ .p2align 4
+L(bk_ssse3_cpy):
+ sub $64, %edi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%edi), %xmm3
+ movdqa %xmm3, 0x30(%edx)
+ movdqu 0x20(%edi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%edi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%edi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jae L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries we need memcpy before initialization has
+   happened.  */
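+/* The resolver below returns the chosen implementation in EAX:
+   __memcpy_ia32 without SSE2; __memcpy_sse2_unaligned when
+   Fast_Unaligned_Load is set, or when SSSE3 is missing; otherwise
+   __memcpy_ssse3, or __memcpy_ssse3_rep when Fast_Rep_String is set.  */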
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(memcpy)
+ .type memcpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2: ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcpy_ia32, @function; \
+ .p2align 4; \
+ .globl __memcpy_ia32; \
+ .hidden __memcpy_ia32; \
+ __memcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memcpy_chk_ia32, @function; \
+ .globl __memcpy_chk_ia32; \
+ .p2align 4; \
+ __memcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 0000000000..1b4fbe2e6f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __memcpy_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  There are no multiarch memcpy functions for static binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ .text
+ENTRY(__memcpy_chk)
+ .type __memcpy_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2: ret
+END(__memcpy_chk)
+# else
+# include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000000..3873594cb2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_sse2_unaligned
+#define MEMCPY_CHK __memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 0000000000..d202fc4a13
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3_rep
+#define MEMCPY_CHK __memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000000..295430b1ef
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3
+#define MEMCPY_CHK __memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 0000000000..6eb418ca7f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,89 @@
+/* Multiple versions of memmove
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(memmove)
+ .type memmove, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2: ret
+END(memmove)
+
+# ifdef SHARED
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .p2align 4; \
+ .globl __memmove_ia32; \
+ .hidden __memmove_ia32; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# else
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .globl __memmove_ia32; \
+ .p2align 4; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# endif
+
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memmove_chk_ia32, @function; \
+ .globl __memmove_chk_ia32; \
+ .p2align 4; \
+ __memmove_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 0000000000..314834c4c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,94 @@
+/* Multiple versions of __memmove_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__memmove_chk)
+ .type __memmove_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
+2: ret
+END(__memmove_chk)
+
+# ifndef SHARED
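+/* For the static library the __memmove_chk_* variants are defined
+   here: each one checks that the destination buffer size (16(%esp))
+   is at least the copy length (12(%esp)) and jumps to __chk_fail
+   otherwise, before tail-calling the corresponding memmove.  */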
+ .type __memmove_chk_sse2_unaligned, @function
+ .p2align 4;
+__memmove_chk_sse2_unaligned:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_sse2_unaligned
+ cfi_endproc
+ .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
+ .type __memmove_chk_ssse3, @function
+ .p2align 4;
+__memmove_chk_ssse3:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3
+ cfi_endproc
+ .size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+ .type __memmove_chk_ssse3_rep, @function
+ .p2align 4;
+__memmove_chk_ssse3_rep:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3_rep
+ cfi_endproc
+ .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+ .type __memmove_chk_ia32, @function
+ .p2align 4;
+__memmove_chk_ia32:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ia32
+ cfi_endproc
+ .size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..a1cea50771
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_sse2_unaligned
+#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 0000000000..5357b33e18
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3_rep
+#define MEMCPY_CHK __mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000000..822d98e954
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3
+#define MEMCPY_CHK __mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 0000000000..06e377fbc9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,81 @@
+/* Multiple versions of mempcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries we need mempcpy before initialization has
+   happened.  */
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(__mempcpy)
+ .type __mempcpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2: ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __mempcpy_ia32, @function; \
+ .p2align 4; \
+ .globl __mempcpy_ia32; \
+ .hidden __mempcpy_ia32; \
+ __mempcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __mempcpy_chk_ia32, @function; \
+ .globl __mempcpy_chk_ia32; \
+ .p2align 4; \
+ __mempcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they would be called without setting up EBX, which is needed
+   for the PLT used by IFUNC.  */
+# define libc_hidden_def(name) \
+ .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000000..e13e5248a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,50 @@
+/* Multiple versions of __mempcpy_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  There are no multiarch mempcpy functions for static binaries.  */
+#if IS_IN (libc)
+# ifdef SHARED
+ .text
+ENTRY(__mempcpy_chk)
+ .type __mempcpy_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2: ret
+END(__mempcpy_chk)
+# else
+# include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000000..ef7bbbe792
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#if IS_IN (libc)
+# define MEMRCHR __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000000..dbbe94fd08
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,417 @@
+/* Optimized memrchr with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+ .text
+ENTRY (MEMCHR)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ add $16, %ecx
+ add $16, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ add $64, %ecx
+ add $64, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
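+	/* pcmpeqb sets matching bytes to 0xff; pmaxub of the four result
+	   vectors acts as a byte-wise OR, so a single pmovmskb test covers
+	   the whole 64-byte block.  */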
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ bsr %eax, %eax
+
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches0):
+ bsr %eax, %eax
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsr %eax, %eax
+ lea 16(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsr %eax, %eax
+ lea 32(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ bsr %eax, %eax
+ lea 48(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ bsr %eax, %eax
+ sub $64, %edx
+ add %eax, %edx
+ jl L(return_null)
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ bsr %eax, %eax
+ sub $48, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 16(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ bsr %eax, %eax
+ sub $32, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 32(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ bsr %eax, %eax
+ sub $16, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 48(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
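+	/* Build the mask (1 << len) - 1 so compare bits beyond the buffer
+	   are ignored.  */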
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+ mov %edx, %ecx
+
+ pmovmskb %xmm1, %edx
+
+ and %ecx, %edx
+ test %edx, %edx
+ jz L(return_null)
+
+ bsr %edx, %ecx
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ mov %ecx, %eax
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ jz L(return_null)
+
+ pshufd $0, %xmm1, %xmm1
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (MEMCHR)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000000..5f7853f683
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,724 @@
+/* Optimized memrchr with SSE2, without using the bsf instruction.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__memrchr_sse2)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
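+
+	/* Broadcast the search byte: two punpcklbw steps replicate it to
+	   a dword, then pshufd $0 fills all of xmm1.  ECX is advanced to
+	   the last 16 bytes so the scan runs backwards.  */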
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
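+	/* Round the scan base up to a 16-byte boundary and grow the
+	   remaining length by the same amount so no bytes are skipped.  */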
+ lea 16(%ecx), %ecx
+ lea 16(%edx), %edx
+ sub %eax, %edx
+ and $-16, %ecx
+
+ .p2align 4
+/* The loop starts with the scan pointer 16-byte aligned.  */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ lea 64(%ecx), %ecx
+ lea 64(%edx), %edx
+ and $-64, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
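+	/* Some chunk of this 64-byte block matched: pcmpeqb lanes are
+	   0x00 or 0xff, so pmaxub merged the four masks like an OR;
+	   re-test them from the highest address down.  */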
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ lea 16(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ lea 32(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch):
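+	/* Locate the most significant set bit of the 16-bit match mask
+	   with a branch ladder instead of bsr, which is slow on the
+	   processors this variant targets.  */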
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_8):
+ test $0x80, %al
+ jnz L(exit_8)
+ test $0x40, %al
+ jnz L(exit_7)
+ test $0x20, %al
+ jnz L(exit_6)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(exit_dispatch_high_8)
+ test $0x08, %ah
+ jnz L(exit_12)
+ test $0x04, %ah
+ jnz L(exit_11)
+ test $0x02, %ah
+ jnz L(exit_10)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high_8):
+ test $0x80, %ah
+ jnz L(exit_16)
+ test $0x40, %ah
+ jnz L(exit_15)
+ test $0x20, %ah
+ jnz L(exit_14)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_2):
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_4):
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_6):
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_7):
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_8):
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_10):
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_11):
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_12):
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_14):
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_15):
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_16):
+ lea 15(%ecx), %eax
+ ret
+
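+	/* The *_1 exits also check the remaining length: a candidate
+	   match can precede the start of the buffer, in which case NULL
+	   is returned.  */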
+ .p2align 4
+L(matches0_1):
+ lea -64(%edx), %edx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ lea -48(%edx), %edx
+ lea 16(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ lea -32(%edx), %edx
+ lea 32(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ lea -16(%edx), %edx
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch_1):
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_8):
+ test $0x80, %al
+ jnz L(exit_1_8)
+ test $0x40, %al
+ jnz L(exit_1_7)
+ test $0x20, %al
+ jnz L(exit_1_6)
+ add $4, %edx
+ jl L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high):
+ mov %ah, %al
+ and $15 << 4, %al
+ jnz L(exit_dispatch_1_high_8)
+ test $0x08, %ah
+ jnz L(exit_1_12)
+ test $0x04, %ah
+ jnz L(exit_1_11)
+ test $0x02, %ah
+ jnz L(exit_1_10)
+ add $8, %edx
+ jl L(return_null)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high_8):
+ test $0x80, %ah
+ jnz L(exit_1_16)
+ test $0x40, %ah
+ jnz L(exit_1_15)
+ test $0x20, %ah
+ jnz L(exit_1_14)
+ add $12, %edx
+ jl L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_2):
+ add $1, %edx
+ jl L(return_null)
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_3):
+ add $2, %edx
+ jl L(return_null)
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_4):
+ add $3, %edx
+ jl L(return_null)
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_6):
+ add $5, %edx
+ jl L(return_null)
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_7):
+ add $6, %edx
+ jl L(return_null)
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_8):
+ add $7, %edx
+ jl L(return_null)
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_10):
+ add $9, %edx
+ jl L(return_null)
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_11):
+ add $10, %edx
+ jl L(return_null)
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_12):
+ add $11, %edx
+ jl L(return_null)
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_14):
+ add $13, %edx
+ jl L(return_null)
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_15):
+ add $14, %edx
+ jl L(return_null)
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_16):
+ add $15, %edx
+ jl L(return_null)
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ mov %eax, %ecx
+ pmovmskb %xmm1, %eax
+
+ and %edx, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ je L(return_null)
+ punpcklbw %xmm1, %xmm1
+
+ mov %ecx, %eax
+ pshufd $0, %xmm1, %xmm1
+
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
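+	/* Unaligned short buffer: read the aligned 16-byte block(s)
+	   covering it and mask off match bits outside the buffer.  */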
+ PUSH (%edi)
+
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000000..d4253a553b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__memrchr)
+ .type __memrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
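+	/* Default: the SSE2 variant that avoids bsf.  2f: no SSE2, use
+	   the plain ia32 version.  3f: bsf is not slow, use the bsf
+	   variant.  */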
+ LOAD_FUNC_GOT_EAX (__memrchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__memrchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+ ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 0000000000..3221077e49
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,811 @@
+/* memset with SSE2 and REP string.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define PARMS 8 /* Preserve EBX. */
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+#else
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
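+
+/* In shared builds the tables store offsets relative to the table base,
+   so the .rodata entries need no dynamic relocations; the absolute
+   target is rebuilt at run time from the PIC register.  */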
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2_rep)
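+/* On success __memset_chk_sse2_rep falls through into the memset body
+   below.  */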
+#endif
+ENTRY (__memset_sse2_rep)
+ ENTRANCE
+
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl CHR(%esp), %eax
+ movb %al, %ah
+ /* Fill the whole EAX with pattern. */
+ movl %eax, %edx
+ shl $16, %eax
+ or %edx, %eax
+#endif
+ movl DEST(%esp), %edx
+ cmp $32, %ecx
+ jae L(32bytesormore)
+
+L(write_less32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
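+
+/* The dispatch above set EDX = dest + length; each L(write_Nbytes)
+   entry below stores the pattern backwards from EDX in 4/2/1-byte
+   pieces, so short sets need no loop.  */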
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes):
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes):
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes):
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes):
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes):
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32 and the alignment of EDX is still unknown.  */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16-byte aligned.  */
+L(not_aligned_16):
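+	/* Store the first 16 bytes unaligned, then round EDX up to a
+	   16-byte boundary and shrink ECX by the distance EDX advanced;
+	   movd reloads the 4-byte pattern clobbered in EAX.  */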
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %eax
+ add %eax, %ecx
+ movd %xmm0, %eax
+
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore)
+
+L(aligned_16_less128bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore):
+ PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+ PUSH (%ebx)
+ mov $DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_data_cache_size, %ebx
+# endif
+#endif
+ mov %ebx, %edi
+ shr $4, %ebx
+ sub %ebx, %edi
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx)
+#endif
+/*
+ * When the data size approaches the L1 data cache size, the fast
+ * string operation (rep stos) prefetches and combines writes
+ * efficiently.
+ */
+ cmp %edi, %ecx
+ jae L(128bytesormore_endof_L1)
+ subl $128, %ecx
+L(128bytesormore_normal):
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jb L(128bytesless_normal)
+
+
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jae L(128bytesormore_normal)
+
+L(128bytesless_normal):
+ POP (%edi)
+ add $128, %ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ CFI_PUSH (%edi)
+ ALIGN (4)
+L(128bytesormore_endof_L1):
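+	/* Near or beyond L1: let rep stosl fill ECX/4 dwords (EDI = dest,
+	   EAX = pattern), then finish the 0-3 remaining bytes with word
+	   and byte stores.  */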
+ mov %edx, %edi
+ mov %ecx, %edx
+ shr $2, %ecx
+ and $3, %edx
+ rep stosl
+ jz L(copy_page_by_rep_exit)
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1)
+ movw %ax, (%edi)
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit)
+L(copy_page_by_rep_left_1):
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%edi)
+ SETRTNVAL
+ RETURN
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes):
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes):
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes):
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 0000000000..d7b8be9114
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,860 @@
+/* memset with SSE2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_BZERO
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define PARMS 8 /* Preserve EBX. */
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+#else
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+ ENTRANCE
+
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl CHR(%esp), %eax
+ movb %al, %ah
+ /* Fill the whole EAX with pattern. */
+ movl %eax, %edx
+ shl $16, %eax
+ or %edx, %eax
+#endif
+ movl DEST(%esp), %edx
+ cmp $32, %ecx
+ jae L(32bytesormore)
+
+L(write_less32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes):
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes):
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes):
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes):
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes):
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32 and the alignment of EDX is still unknown.  */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16-byte aligned.  */
+L(not_aligned_16):
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx
+ sub %edx, %eax
+ add %eax, %ecx
+ movd %xmm0, %eax
+
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore)
+
+L(aligned_16_less128bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+ PUSH (%ebx)
+ mov $SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_shared_cache_size, %ebx
+# endif
+#endif
+ cmp %ebx, %ecx
+ jae L(128bytesormore_nt_start)
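+
+	/* Three regimes: length >= shared cache size uses non-temporal
+	   stores; length >= data cache size uses prefetching cached
+	   stores; smaller lengths use the plain 128-byte loop.  */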
+
+
+#ifdef DATA_CACHE_SIZE
+ POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+ cmp $DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+# define RESTORE_EBX_STATE
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+ POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
+ cmp __x86_data_cache_size, %ecx
+# endif
+#endif
+
+ jae L(128bytes_L2_normal)
+ subl $128, %ecx
+L(128bytesormore_normal):
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jb L(128bytesless_normal)
+
+
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jae L(128bytesormore_normal)
+
+L(128bytesless_normal):
+ add $128, %ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytes_L2_normal):
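+	/* Larger than the data (L1) cache but within the shared cache:
+	   prefetch two cache lines ahead and store 128 bytes per
+	   iteration with cached stores.  */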
+ prefetcht0 0x380(%edx)
+ prefetcht0 0x3c0(%edx)
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
+ add $128, %edx
+ cmp $128, %ecx
+ jae L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ RESTORE_EBX_STATE
+L(128bytesormore_nt_start):
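+	/* ECX >= shared cache size.  Write the first shared-cache-size
+	   bytes with cached stores, then the remainder with non-temporal
+	   movntdq to avoid evicting the cache; sfence orders the NT
+	   stores before returning.  */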
+ sub %ebx, %ecx
+ ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+ prefetcht0 0x3c0(%edx)
+ prefetcht0 0x380(%edx)
+ sub $0x80, %ebx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ebx
+ jae L(128bytesormore_shared_cache_loop)
+ cmp $0x80, %ecx
+ jb L(shared_cache_loop_end)
+ ALIGN (4)
+L(128bytesormore_nt):
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm0, 0x10(%edx)
+ movntdq %xmm0, 0x20(%edx)
+ movntdq %xmm0, 0x30(%edx)
+ movntdq %xmm0, 0x40(%edx)
+ movntdq %xmm0, 0x50(%edx)
+ movntdq %xmm0, 0x60(%edx)
+ movntdq %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ecx
+ jae L(128bytesormore_nt)
+ sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx)
+#endif
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes):
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes):
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes):
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END
+
+END (__memset_sse2)
+
+#endif
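
The jump table just above dispatches on the number of bytes still to be written: sixteen table entries share each L(aligned_16_*) block, entering the same fall-through chain of movdqa stores at different depths, and every block finishes with movl/movw/movb stores addressed backwards from the end pointer in %edx, so each remainder gets a fixed, loop-free store sequence. A minimal C sketch of the 0..7 subset of that tail decomposition (the assembly extends it to 15 bytes with an 8-byte movq); the names store4/store2/store1 and tail_0_7 are illustrative, not glibc interfaces:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Store the low 4/2/1 bytes of PAT ending OFF bytes before END,
   mirroring the movl/movw/movb -N(%edx) forms above.  memcpy keeps the
   sketch free of alignment and strict-aliasing concerns.  */
static void store4 (unsigned char *end, int off, uint32_t pat)
{ memcpy (end - off, &pat, 4); }
static void store2 (unsigned char *end, int off, uint32_t pat)
{ uint16_t v = (uint16_t) pat; memcpy (end - off, &v, 2); }
static void store1 (unsigned char *end, int off, uint32_t pat)
{ end[-off] = (unsigned char) pat; }

/* Finish a 0..7 byte tail the way the aligned_16_* blocks do: a fixed
   store sequence per remainder instead of a byte loop.  */
static void tail_0_7 (unsigned char *end, unsigned rem, uint32_t pat)
{
  switch (rem)
    {
    case 7: store4 (end, 7, pat); store2 (end, 3, pat);
            store1 (end, 1, pat); break;
    case 6: store4 (end, 6, pat); store2 (end, 2, pat); break;
    case 5: store4 (end, 5, pat); store1 (end, 1, pat); break;
    case 4: store4 (end, 4, pat); break;
    case 3: store2 (end, 3, pat); store1 (end, 1, pat); break;
    case 2: store2 (end, 2, pat); break;
    case 1: store1 (end, 1, pat); break;
    default: break;
    }
}

int
main (void)
{
  unsigned char buf[8] = { 0 };
  tail_0_7 (buf + 8, 7, 0x41414141);	/* fill the last 7 bytes with 'A' */
  for (int i = 0; i < 8; i++)
    printf ("%02x", buf[i]);		/* prints 0041414141414141 */
  putchar ('\n');
  return 0;
}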
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 0000000000..f601663a9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,75 @@
+/* Multiple versions of memset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(memset)
+ .type memset, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
+2: ret
+END(memset)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memset_ia32, @function; \
+ .globl __memset_ia32; \
+ .p2align 4; \
+ __memset_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memset_chk_ia32, @function; \
+ .globl __memset_chk_ia32; \
+ .p2align 4; \
+ __memset_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 0000000000..573cf4208a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,82 @@
+/* Multiple versions of __memset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+ .text
+ENTRY(__memset_chk)
+ .type __memset_chk, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+ HAS_ARCH_FEATURE (Fast_Rep_String)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2: ret
+END(__memset_chk)
+
+# ifdef SHARED
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+ .text
+ .type __memset_chk_sse2, @function
+ .p2align 4;
+__memset_chk_sse2:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_sse2
+ cfi_endproc
+ .size __memset_chk_sse2, .-__memset_chk_sse2
+
+ .type __memset_chk_sse2_rep, @function
+ .p2align 4;
+__memset_chk_sse2_rep:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_sse2_rep
+ cfi_endproc
+ .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+ .type __memset_chk_ia32, @function
+ .p2align 4;
+__memset_chk_ia32:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_ia32
+ cfi_endproc
+ .size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
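
In the static (!SHARED) branch, each __memset_chk_* stub performs the same three-instruction check: load the requested length from 12(%esp), compare it against the destination object size at 16(%esp), and branch to __chk_fail when the buffer is too small, otherwise tail-call the unchecked variant. In C the fortified entry point amounts to the following sketch (only __chk_fail and memset are real glibc symbols here):

#include <stddef.h>
#include <string.h>

extern void __chk_fail (void) __attribute__ ((noreturn));

/* Hypothetical stand-in for __memset_chk: DSTLEN is the size of the
   destination object, as computed by the compiler with
   __builtin_object_size.  */
void *
my_memset_chk (void *dst, int c, size_t len, size_t dstlen)
{
  if (dstlen < len)		/* cmpl %eax, 16(%esp); jb __chk_fail */
    __chk_fail ();
  return memset (dst, c, len);
}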
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000000..88c0e5776c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000000..038c74896b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000000..0a41d63ee8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,65 @@
+/* Multiple versions of rawmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__rawmemchr)
+ .type __rawmemchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 3f
+
+ LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
+ ret
+
+2: LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
+ ret
+
+3: LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
+ ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __rawmemchr_ia32, @function; \
+ .globl __rawmemchr_ia32; \
+ .p2align 4; \
+ __rawmemchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC relies on.  */
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 0000000000..1aa5440644
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..2e9619f97c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fma.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+double
+__fma_fma (double x, double y, double z)
+{
+ asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
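
vfmadd213sd multiplies operands 2 and 1 and adds operand 3, so with the constraints above the statement computes x = x*y + z with a single rounding of the exact result. The difference from a separate multiply-then-add is observable; a small self-contained check using the portable compiler builtin (compile with -ffp-contract=off so the plain expression is not itself contracted into an FMA):

#include <stdio.h>

int
main (void)
{
  double x = 1.0 + 0x1p-52;	/* one ulp above 1.0 */
  double y = 1.0 - 0x1p-53;
  double z = -1.0;
  double fused = __builtin_fma (x, y, z);	/* one rounding */
  double unfused = x * y + z;	/* the product rounds to 1.0 first */
  /* fused keeps the low-order bits (about 1.1e-16); unfused is 0.  */
  printf ("fused   = %a\nunfused = %a\n", fused, unfused);
  return 0;
}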
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..411ebb2ba9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fma.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma,
+ HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+#define __fma __fma_ia32
+
+#include <sysdeps/ieee754/ldbl-96/s_fma.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..ee57abfda2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,27 @@
+/* FMA version of fmaf.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+float
+__fmaf_fma (float x, float y, float z)
+{
+ asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..00b0fbcfc5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,34 @@
+/* Multiple versions of fmaf.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf,
+ HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+#define __fmaf __fmaf_ia32
+
+#include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
new file mode 100644
index 0000000000..7db31b02f8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/sched_cpucount.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..ee81ab6ae3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,9 @@
+/* Multiple versions of stpcpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
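
These one-line wrappers show the multiarch parameterization pattern: a single implementation (strcpy-sse2.S or strcpy-ssse3.S) is assembled several times under different macro settings, where STRCPY renames the emitted symbol, USE_AS_STPCPY changes the return value, and USE_AS_STRNCPY adds length handling. The same idea reduced to the return-value switch in C (illustrative names; the real selection happens in the assembly sources):

/* One parameterized implementation: defining USE_AS_STPCPY before this
   point flips the return value, just as stpcpy-sse2.S does before
   including strcpy-sse2.S.  */
#define USE_AS_STPCPY 1

char *
copy_impl (char *dst, const char *src)
{
  char *d = dst;
  while ((*d++ = *src++) != '\0')
    ;
#if USE_AS_STPCPY
  return d - 1;			/* stpcpy: pointer to the final NUL */
#else
  return dst;			/* strcpy: the destination pointer */
#endif
}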
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..2698ca6a8c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,8 @@
+/* Multiple versions of stpncpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
new file mode 100644
index 0000000000..753c6ec84a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c
@@ -0,0 +1,12 @@
+#include <string.h>
+
+extern __typeof (strcasecmp) __strcasecmp_nonascii;
+
+#define __strcasecmp __strcasecmp_nonascii
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32)
+
+/* The needs of strcasecmp within libc itself are minimal, so there is
+   no need to go through the IFUNC.  */
+strong_alias (__strcasecmp_nonascii, __GI___strcasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
new file mode 100644
index 0000000000..ec59276408
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strcasecmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY(__strcasecmp)
+ .type __strcasecmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
+2: ret
+END(__strcasecmp)
+
+weak_alias (__strcasecmp, strcasecmp)
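
This resolver also demonstrates a negative arch feature: SSE4.2 is selected only when it is present and not flagged Slow_SSE4_2; otherwise the SSSE3 variant wins. The same ladder in C, with the cpu_* predicates standing in for the feature macros (placeholders, not glibc interfaces):

typedef int (*cmp_fn) (const char *, const char *);

extern int __strcasecmp_ia32 (const char *, const char *);
extern int __strcasecmp_ssse3 (const char *, const char *);
extern int __strcasecmp_sse4_2 (const char *, const char *);

extern int cpu_has_ssse3 (void);	/* placeholder */
extern int cpu_has_sse4_2 (void);	/* placeholder */
extern int cpu_slow_sse4_2 (void);	/* placeholder */

static cmp_fn
select_strcasecmp (void)
{
  if (!cpu_has_ssse3 ())
    return __strcasecmp_ia32;
  if (cpu_has_sse4_2 () && !cpu_slow_sse4_2 ())
    return __strcasecmp_sse4_2;
  return __strcasecmp_ssse3;
}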
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
new file mode 100644
index 0000000000..d4fcd2b4a1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii;
+
+#define __strcasecmp_l __strcasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL 1
+#include <string/strcasecmp.c>
+
+strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32)
+
+/* The needs of strcasecmp_l within libc itself are minimal, so there
+   is no need to go through the IFUNC.  */
+strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
new file mode 100644
index 0000000000..411d4153f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000000..a22b93c518
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000000..711c09b0dc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strcasecmp_l
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
new file mode 100644
index 0000000000..6359c7330c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S
@@ -0,0 +1,1245 @@
+/* strcat with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ SETUP_PIC_REG(cx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+# define STRCAT __strcat_sse2
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4
+# define STR2 STR1+4
+
+# ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+# define STR3 STR1+4
+# else
+# define STR3 STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else
+# define RETURN POP(%esi); ret; CFI_PUSH(%esi);
+# endif
+
+.text
+ENTRY (STRCAT)
+ PUSH (%esi)
+ mov STR1(%esp), %eax
+ mov STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+ PUSH (%ebx)
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitZero)
+# endif
+ cmpb $0, (%esi)
+ mov %esi, %ecx
+ mov %eax, %edx
+ jz L(ExitZero)
+
+ and $63, %ecx
+ and $63, %edx
+ cmp $32, %ecx
+ ja L(StrlenCore7_1)
+ cmp $48, %edx
+ ja L(alignment_prolog)
+
+ pxor %xmm0, %xmm0
+ pxor %xmm4, %xmm4
+ pxor %xmm7, %xmm7
+ movdqu (%eax), %xmm1
+ movdqu (%esi), %xmm5
+ pcmpeqb %xmm1, %xmm0
+ movdqu 16(%esi), %xmm6
+ pmovmskb %xmm0, %ecx
+ pcmpeqb %xmm5, %xmm4
+ pcmpeqb %xmm6, %xmm7
+ test %ecx, %ecx
+ jnz L(exit_less16_)
+ mov %eax, %ecx
+ and $-16, %eax
+ jmp L(loop_prolog)
+
+L(alignment_prolog):
+ pxor %xmm0, %xmm0
+ pxor %xmm4, %xmm4
+ mov %edx, %ecx
+ pxor %xmm7, %xmm7
+ and $15, %ecx
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ movdqu (%esi), %xmm5
+ movdqu 16(%esi), %xmm6
+ pmovmskb %xmm0, %edx
+ pcmpeqb %xmm5, %xmm4
+ shr %cl, %edx
+ pcmpeqb %xmm6, %xmm7
+ test %edx, %edx
+ jnz L(exit_less16)
+ add %eax, %ecx
+
+ pxor %xmm0, %xmm0
+L(loop_prolog):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop)
+ bsf %edx, %edx
+ add %edx, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit16):
+ bsf %edx, %edx
+ lea 16(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit32):
+ bsf %edx, %edx
+ lea 32(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit48):
+ bsf %edx, %edx
+ lea 48(%eax, %edx), %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_less16):
+ bsf %edx, %edx
+ add %ecx, %eax
+ add %edx, %eax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_less16_):
+ bsf %ecx, %ecx
+ add %ecx, %eax
+
+ .p2align 4
+L(StartStrcpyPart):
+ pmovmskb %xmm4, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ movdqu %xmm5, (%eax)
+ pmovmskb %xmm7, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ mov %esi, %ecx
+ and $-16, %esi
+ and $15, %ecx
+ pxor %xmm0, %xmm0
+# ifdef USE_AS_STRNCAT
+ add %ecx, %ebx
+ sbb %edx, %edx
+ or %edx, %ebx
+# endif
+ sub %ecx, %eax
+ jmp L(Unalign16Both)
+
+L(StrlenCore7_1):
+ mov %eax, %ecx
+ pxor %xmm0, %xmm0
+ and $15, %ecx
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ shr %cl, %edx
+ test %edx, %edx
+ jnz L(exit_less16_1)
+ add %eax, %ecx
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+
+ .p2align 4
+L(align16_loop_1):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16_1)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32_1)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48_1)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop_1)
+ bsf %edx, %edx
+ add %edx, %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit16_1):
+ bsf %edx, %edx
+ lea 16(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit32_1):
+ bsf %edx, %edx
+ lea 32(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit48_1):
+ bsf %edx, %edx
+ lea 48(%eax, %edx), %eax
+ jmp L(StartStrcpyPart_1)
+
+ .p2align 4
+L(exit_less16_1):
+ bsf %edx, %edx
+ add %ecx, %eax
+ add %edx, %eax
+
+ .p2align 4
+L(StartStrcpyPart_1):
+ mov %esi, %ecx
+ and $15, %ecx
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+# ifdef USE_AS_STRNCAT
+ cmp $48, %ebx
+ ja L(BigN)
+# endif
+ pcmpeqb (%esi), %xmm1
+# ifdef USE_AS_STRNCAT
+ add %ecx, %ebx
+# endif
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STRNCAT
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%eax)
+ sub %ecx, %eax
+
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+L(Unalign16BothBigN):
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%eax, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%eax, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%eax, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%eax, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+# ifdef USE_AS_STRNCAT
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm3, (%eax, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %eax
+# ifdef USE_AS_STRNCAT
+ lea 128(%ebx, %edx), %ebx
+# endif
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+
+ .p2align 4
+L(Unaligned64Loop_start):
+ add $64, %eax
+ add $64, %esi
+ movdqu %xmm4, -64(%eax)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%eax)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%eax)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%eax)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+# ifdef USE_AS_STRNCAT
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%eax)
+ movdqu %xmm5, 16(%eax)
+ movdqu %xmm6, 32(%eax)
+ add $48, %esi
+ add $48, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+ .p2align 4
+L(BigN):
+ pcmpeqb (%esi), %xmm1
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%eax)
+ sub %ecx, %eax
+ sub $48, %ebx
+ add %ecx, %ebx
+
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%eax, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+ jmp L(Unalign16BothBigN)
+# endif
+
+/*------------end of main part-------------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %ecx, %eax
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %eax
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%eax)
+ add $16, %esi
+ add $16, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%eax)
+ movdqu %xmm5, 16(%eax)
+ add $32, %esi
+ add $32, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+# ifdef USE_AS_STRNCAT
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %eax
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %eax
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %eax
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+
+# endif
+
+# ifdef USE_AS_STRNCAT
+ .p2align 4
+L(StrncatExit0):
+ movb %bh, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+# endif
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit1):
+ movb %bh, 1(%eax)
+# endif
+L(Exit1):
+# ifdef USE_AS_STRNCAT
+ movb (%esi), %dh
+# endif
+ movb %dh, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit2):
+ movb %bh, 2(%eax)
+# endif
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit3):
+ movb %bh, 3(%eax)
+# endif
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%eax)
+# ifdef USE_AS_STRNCAT
+ movb 2(%esi), %dh
+# endif
+ movb %dh, 2(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit4):
+ movb %bh, 4(%eax)
+# endif
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit5):
+ movb %bh, 5(%eax)
+# endif
+L(Exit5):
+ movl (%esi), %ecx
+# ifdef USE_AS_STRNCAT
+ movb 4(%esi), %dh
+# endif
+ movb %dh, 4(%eax)
+ movl %ecx, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit6):
+ movb %bh, 6(%eax)
+# endif
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%eax)
+ movw %dx, 4(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit7):
+ movb %bh, 7(%eax)
+# endif
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%eax)
+ movl %edx, 3(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit8):
+ movb %bh, 8(%eax)
+# endif
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit9):
+ movb %bh, 9(%eax)
+# endif
+L(Exit9):
+ movlpd (%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+ movb 8(%esi), %dh
+# endif
+ movb %dh, 8(%eax)
+ movlpd %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit10):
+ movb %bh, 10(%eax)
+# endif
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%eax)
+ movw %dx, 8(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit11):
+ movb %bh, 11(%eax)
+# endif
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%eax)
+ movl %edx, 7(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit12):
+ movb %bh, 12(%eax)
+# endif
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%eax)
+ movl %edx, 8(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit13):
+ movb %bh, 13(%eax)
+# endif
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 5(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit14):
+ movb %bh, 14(%eax)
+# endif
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 6(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit15):
+ movb %bh, 15(%eax)
+# endif
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%eax)
+ movlpd %xmm1, 7(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit16):
+ movb %bh, 16(%eax)
+# endif
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit17):
+ movb %bh, 17(%eax)
+# endif
+L(Exit17):
+ movdqu (%esi), %xmm0
+# ifdef USE_AS_STRNCAT
+ movb 16(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movb %dh, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit18):
+ movb %bh, 18(%eax)
+# endif
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%eax)
+ movw %cx, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit19):
+ movb %bh, 19(%eax)
+# endif
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movl %ecx, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit20):
+ movb %bh, 20(%eax)
+# endif
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movl %ecx, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit21):
+ movb %bh, 21(%eax)
+# endif
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+# ifdef USE_AS_STRNCAT
+ movb 20(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movl %ecx, 16(%eax)
+ movb %dh, 20(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit22):
+ movb %bh, 22(%eax)
+# endif
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%eax)
+ movlpd %xmm3, 14(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit23):
+ movb %bh, 23(%eax)
+# endif
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%eax)
+ movlpd %xmm3, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit24):
+ movb %bh, 24(%eax)
+# endif
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit25):
+ movb %bh, 25(%eax)
+# endif
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+# ifdef USE_AS_STRNCAT
+ movb 24(%esi), %dh
+# endif
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movb %dh, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit26):
+ movb %bh, 26(%eax)
+# endif
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movw %cx, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit27):
+ movb %bh, 27(%eax)
+# endif
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movl %ecx, 23(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit28):
+ movb %bh, 28(%eax)
+# endif
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%eax)
+ movlpd %xmm2, 16(%eax)
+ movl %ecx, 24(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit29):
+ movb %bh, 29(%eax)
+# endif
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 13(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit30):
+ movb %bh, 30(%eax)
+# endif
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 14(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit31):
+ movb %bh, 31(%eax)
+# endif
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 15(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+# ifdef USE_AS_STRNCAT
+L(StrncatExit32):
+ movb %bh, 32(%eax)
+# endif
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%eax)
+ movdqu %xmm2, 16(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+# ifdef USE_AS_STRNCAT
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%eax)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%eax)
+ xor %bh, %bh
+ movb %bh, 64(%eax)
+ mov STR3(%esp), %eax
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%eax)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%eax)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%eax)
+ lea 16(%eax, %ecx), %eax
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
+# endif
+ .p2align 4
+L(ExitZero):
+ RETURN
+
+END (STRCAT)
+
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCAT
+L(ExitStrncatTable):
+ .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
+ .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
+# endif
+#endif
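
In the SHARED case, JMPTBL(I, B) stores I - B, so L(ExitTable) and L(ExitStrncatTable) hold offsets relative to the table itself and need no load-time relocations; BRANCH_TO_JMPTBL_ENTRY recovers the PC, adds the table's displacement, and then adds the fetched offset. The same relative-table trick can be written in C with GCC's labels-as-values extension (a sketch; GNU C only):

#include <stdio.h>

/* Dispatch through a table of offsets relative to a base label, the C
   analogue of the I - B entries above.  N must be below 3.  */
static int
dispatch (unsigned n)
{
  static const int offsets[] = {
    &&exit0 - &&exit0,
    &&exit1 - &&exit0,
    &&exit2 - &&exit0,
  };
  goto *(&&exit0 + offsets[n]);	/* base + relative offset */

 exit0:
  return 0;
 exit1:
  return 1;
 exit2:
  return 2;
}

int
main (void)
{
  printf ("%d %d %d\n", dispatch (0), dispatch (1), dispatch (2));
  return 0;
}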
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000000..59ffbc60a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S
@@ -0,0 +1,572 @@
+/* strcat with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCAT
+# define STRCAT __strcat_ssse3
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4
+# define STR2 STR1+4
+
+# ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+ PUSH (%edi)
+ mov STR1(%esp), %edi
+ mov %edi, %edx
+
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-sse2.S"
+
+L(StartStrcpyPart):
+ mov STR2(%esp), %ecx
+ lea (%edi, %eax), %edx
+# ifdef USE_AS_STRNCAT
+ PUSH (%ebx)
+ mov LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(StrncatExit0)
+ cmp $8, %ebx
+ jbe L(StrncatExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmpb $0, 7(%ecx)
+ jz L(Exit8)
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jb L(StrncatExit15Bytes)
+# endif
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmpb $0, 14(%ecx)
+ jz L(Exit15)
+ cmpb $0, 15(%ecx)
+ jz L(Exit16)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ je L(StrncatExit16)
+
+# define RETURN1 \
+ POP (%ebx); \
+ POP (%edi); \
+ ret; \
+ CFI_PUSH (%ebx); \
+ CFI_PUSH (%edi)
+# define USE_AS_STRNCPY
+# else
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+# include "strcpy-ssse3.S"
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit1):
+ movb %bh, 1(%edx)
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit2):
+ movb %bh, 2(%edx)
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit3):
+ movb %bh, 3(%edx)
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit4):
+ movb %bh, 4(%edx)
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit5):
+ movb %bh, 5(%edx)
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit6):
+ movb %bh, 6(%edx)
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit7):
+ movb %bh, 7(%edx)
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit8):
+ movb %bh, 8(%edx)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit9):
+ movb %bh, 9(%edx)
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit10):
+ movb %bh, 10(%edx)
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit11):
+ movb %bh, 11(%edx)
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit12):
+ movb %bh, 12(%edx)
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit13):
+ movb %bh, 13(%edx)
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit14):
+ movb %bh, 14(%edx)
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit15):
+ movb %bh, 15(%edx)
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit16):
+ movb %bh, 16(%edx)
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ lea (%esi, %edx), %esi
+ lea -9(%ebx), %edx
+ and $1<<7, %dh
+ or %al, %dh
+ test %dh, %dh
+ lea (%esi), %edx
+ POP (%esi)
+ jz L(ExitHighCase2)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
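+	/* Advance %eax past the byte at 7(%edx) only if it is non-NUL:
+	   cmpb sets CF exactly when that byte is zero, and
+	   sbb $-1, %eax computes %eax + 1 - CF.  The store below then
+	   places the terminating NUL.  */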
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ xor %cl, %cl
+ movb %cl, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase2):
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHighCase3)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb %bh, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase3):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movb %bh, 16(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit0):
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit15Bytes):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit8Bytes):
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+# endif
+END (STRCAT)
+#endif
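
For reference, all of the exit paths above implement ordinary strncat
semantics: find the end of STR1, copy at most LEN bytes of STR2, and
always write a terminating NUL (%bh serves as a convenient zero on
those paths, since the remaining count in %ebx is at most 16 by the
time it is used that way).  A minimal portable C rendering of that
contract, not the glibc implementation:

    #include <string.h>

    /* Reference for the contract the assembly above optimizes: append
       at most N bytes of SRC to DST, then always NUL-terminate.  DST
       must have room for strlen (dst) + min (n, strlen (src)) + 1.  */
    char *
    strncat_ref (char *dst, const char *src, size_t n)
    {
      char *end = dst + strlen (dst);	/* The strlen-sse2.S part.  */
      size_t i;

      for (i = 0; i < n && src[i] != '\0'; i++)	/* The strcpy part.  */
        end[i] = src[i];
      end[i] = '\0';			/* The StrncatExit* stores.  */
      return dst;
    }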
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+# define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3 __strncat_ssse3
+# define STRCAT_SSE2 __strncat_sse2
+# define STRCAT_IA32 __strncat_ia32
+# define __GI_STRCAT __GI_strncat
+#else
+# define STRCAT_SSSE3 __strcat_ssse3
+# define STRCAT_SSE2 __strcat_sse2
+# define STRCAT_IA32 __strcat_ia32
+# define __GI_STRCAT __GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncat in the static library, since
+   we need strncat before initialization has happened.  */
+#if IS_IN (libc)
+
+ .text
+ENTRY(STRCAT)
+ .type STRCAT, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2: ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCAT_IA32, @function; \
+ .align 16; \
+ .globl STRCAT_IA32; \
+ .hidden STRCAT_IA32; \
+ STRCAT_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
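
The hand-written resolver above is the assembly form of an ELF IFUNC:
the .type ... @gnu_indirect_function symbol runs once at relocation
time and returns the address the dynamic linker should bind strcat to.
A compact C sketch of the same dispatch idea using GCC's ifunc
attribute; the variant names here are illustrative, and the real
resolver also consults glibc-internal bits such as Fast_Unaligned_Load
that have no public C equivalent:

    #include <string.h>

    /* Two stand-in variants; glibc's are the assembly files above.  */
    static char *
    my_strcat_generic (char *d, const char *s) { return strcat (d, s); }
    static char *
    my_strcat_ssse3 (char *d, const char *s) { return strcat (d, s); }

    /* Runs during relocation; its return value becomes the target of
       the my_strcat symbol.  */
    static char *(*resolve_my_strcat (void)) (char *, const char *)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("ssse3")
	     ? my_strcat_ssse3 : my_strcat_generic;
    }

    char *my_strcat (char *, const char *)
      __attribute__ ((ifunc ("resolve_my_strcat")));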
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2, using bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi)
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ .text
+ENTRY (__strchr_sse2_bsf)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $15, %ecx
+ pshufd $0, %xmm1, %xmm1
+ je L(loop)
+
+/* Handle unaligned string. */
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+ /* Check which byte is a match. */
+ bsf %eax, %eax
+ /* Is there a NULL? */
+ test %edx, %edx
+ je L(unaligned_match)
+ bsf %edx, %edx
+ cmpl %edx, %eax
+ /* Return NULL if NULL comes first. */
+ ja L(return_null)
+L(unaligned_match):
+ add %edi, %eax
+ add %ecx, %eax
+ RETURN
+
+ .p2align 4
+L(unaligned_no_match):
+ test %edx, %edx
+ jne L(return_null)
+ pxor %xmm2, %xmm2
+
+ add $16, %edi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ jmp L(loop)
+
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ bsf %eax, %eax
+ /* There is a match. First find where NULL is. */
+ test %edx, %edx
+ je L(match)
+ bsf %edx, %ecx
+ /* Check if NULL comes first. */
+ cmpl %ecx, %eax
+ ja L(return_null)
+L(match):
+ sub $16, %edi
+ add %edi, %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+END (__strchr_sse2_bsf)
+#endif
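
The core of the routine above, in intrinsics form: broadcast the needle
byte, compare 16 bytes at a time against both the needle and NUL, and
use the two pmovmskb bitmasks to decide which comes first.  A hedged C
sketch, illustrative only; it assumes a 16-byte-aligned s, whereas the
real code handles the unaligned head separately:

    #include <stddef.h>
    #include <emmintrin.h>	/* SSE2 intrinsics.  */

    /* Aligned 16-byte loads never cross a page boundary, so reading
       past the terminator is as safe here as it is in the assembly.  */
    static char *
    strchr16_sketch (const char *s, int c)
    {
      const __m128i zero = _mm_setzero_si128 ();
      const __m128i needle = _mm_set1_epi8 ((char) c);

      for (;;)
        {
          __m128i chunk = _mm_load_si128 ((const __m128i *) s);
          unsigned int hit = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
          unsigned int nul = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
          if (hit | nul)
            {
              /* bsf in the assembly.  A match at or before the NUL
                 wins, which also makes a search for '\0' find the
                 terminator itself, as strchr requires.  */
              unsigned int first_hit = hit ? __builtin_ctz (hit) : 16;
              unsigned int first_nul = nul ? __builtin_ctz (nul) : 16;
              return first_hit <= first_nul ? (char *) s + first_hit : NULL;
            }
          s += 16;
        }
    }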
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
new file mode 100644
index 0000000000..1f9e875b04
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -0,0 +1,348 @@
+/* strchr with SSE2, without bsf
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi)
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__strchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $15, %ecx
+ pshufd $0, %xmm1, %xmm1
+ je L(loop)
+
+/* Handle unaligned string. */
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+ /* Check which byte is a match. */
+ /* Is there a NULL? */
+ add %ecx, %edi
+ test %edx, %edx
+ jz L(match_case1)
+ jmp L(match_case2)
+
+ .p2align 4
+L(unaligned_no_match):
+ test %edx, %edx
+ jne L(return_null)
+
+ pxor %xmm2, %xmm2
+ add $16, %edi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+ jmp L(loop)
+
+L(matches):
+ /* There is a match. First find where NULL is. */
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+	jz	L(match_high_case2)
+
+ mov %al, %cl
+ and $15, %cl
+ jnz L(match_case2_4)
+
+ mov %dl, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x10, %dl
+ jnz L(return_null)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x20, %dl
+ jnz L(return_null)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x40, %dl
+ jnz L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_4):
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x01, %dl
+ jnz L(return_null)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x02, %dl
+ jnz L(return_null)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x04, %dl
+ jnz L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+
+ mov %ah, %cl
+ and $15, %cl
+ jnz L(match_case2_12)
+
+ mov %dh, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x10, %dh
+ jnz L(return_null)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x20, %dh
+ jnz L(return_null)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x40, %dh
+ jnz L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_12):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x01, %dh
+ jnz L(return_null)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x02, %dh
+ jnz L(return_null)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x04, %dh
+ jnz L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+	jz	L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ lea (%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea 14(%edi), %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+END (__strchr_sse2)
+#endif
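
This variant avoids bsf entirely, for processors where the strchr.S
dispatcher below reports Slow_BSF: instead of one bit-scan over the
pmovmskb mask it walks the mask with the unrolled ladder of single-bit
tests in L(match_case1) and L(Exit1)..L(Exit15).  The idea, with a
loop standing in for the unrolled ladder:

    /* Index of the lowest set bit of a 16-bit pmovmskb mask, computed
       without a bit-scan instruction; the assembly above unrolls this
       into the Exit1..Exit15 ladder.  */
    static int
    first_set_bit16 (unsigned int mask)
    {
      for (int i = 0; i < 16; i++)
        if (mask & (1u << i))
          return i;
      return -1;	/* No bit set.  */
    }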
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
new file mode 100644
index 0000000000..5b97b1c767
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(strchr)
+ .type strchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2: ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strchr_ia32, @function; \
+ .globl __strchr_ia32; \
+ .p2align 4; \
+ __strchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+# define STRCMP __strncmp_sse4_2
+# endif
+# define STR1 8
+# define STR2 STR1+4
+# define CNT STR2+4
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM %ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+# define STR1 12
+# else
+# define STR1 8
+# endif
+# define STR2 STR1+4
+# define LOCALE 12 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%edi); POP (%ebx); ret; \
+ .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+# define RETURN POP (%edi); ret; .p2align 4; CFI_PUSH (%edi)
+# endif
+# define NONASCII __strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strncasecmp_l_sse4_2
+# endif
+# ifdef PIC
+# define STR1 16
+# else
+# define STR1 12
+# endif
+# define STR2 STR1+4
+# define CNT STR2+4
+# define LOCALE 16 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%edi); POP (REM); POP (%ebx); ret; \
+ .p2align 4; \
+ CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi)
+# else
+# define RETURN POP (%edi); POP (REM); ret; \
+ .p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi)
+# endif
+# define REM %ebp
+# define NONASCII __strncasecmp_nonascii
+#else
+# ifndef STRCMP
+# define STRCMP __strcmp_sse4_2
+# endif
+# define STR1 4
+# define STR2 STR1+4
+# define RETURN ret; .p2align 4
+#endif
+
+ .section .text.sse4.2,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
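+/* Fetch the thread's current locale from TLS; if its LC_CTYPE data
+   demands non-ASCII case conversion, defer to the generic
+   __strcasecmp_nonascii, otherwise branch to the ASCII fast path
+   at L(ascii).  */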
+ENTRY (__strcasecmp_sse4_2)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strcasecmp_nonascii
+# else
+ jne __strcasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strcasecmp_sse4_2)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_sse4_2)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strncasecmp_nonascii
+# else
+ jne __strncasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strncasecmp_sse4_2)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movl LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+ jne NONASCII
+
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Lbelowupper:
+ .quad 0x4040404040404040
+ .quad 0x4040404040404040
+.Ltopupper:
+ .quad 0x5b5b5b5b5b5b5b5b
+ .quad 0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+# ifdef PIC
+# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+# define UCLOW_reg .Lbelowupper
+# define UCHIGH_reg .Ltopupper
+# define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ PUSH (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ PUSH (%edi)
+#endif
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ movl CNT(%esp), REM
+ test REM, REM
+ je L(eq)
+#endif
+ mov %dx, %cx
+ and $0xfff, %cx
+ cmp $0xff0, %cx
+ ja L(first4bytes)
+ movdqu (%edx), %xmm2
+ mov %eax, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(first4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
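+/* Vectorized ASCII case folding: pcmpgtb against 0x40 ('A' - 1) and
+   0x5b ('Z' + 1) isolates the uppercase bytes of each operand, and
+   OR-ing 0x20 into exactly those bytes converts them to lowercase.  */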
+# define TOLOWER(reg1, reg2) \
+ movdqa reg1, %xmm3; \
+ movdqa UCHIGH_reg, %xmm4; \
+ movdqa reg2, %xmm5; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb UCLOW_reg, %xmm3; \
+ pcmpgtb reg1, %xmm4; \
+ pcmpgtb UCLOW_reg, %xmm5; \
+ pcmpgtb reg2, %xmm6; \
+ pand %xmm4, %xmm3; \
+ pand %xmm6, %xmm5; \
+ pand LCQWORD_reg, %xmm3; \
+ pand LCQWORD_reg, %xmm5; \
+ por %xmm3, reg1; \
+ por %xmm5, reg2
+
+ movdqu (%eax), %xmm1
+ TOLOWER (%xmm2, %xmm1)
+ movd %xmm2, %ecx
+ movd %xmm1, %edi
+ movdqa %xmm2, %xmm3
+ movdqa %xmm1, %xmm4
+ cmpl %edi, %ecx
+#else
+# define TOLOWER(reg1, reg2)
+
+ movd %xmm2, %ecx
+ cmp (%eax), %ecx
+#endif
+ jne L(less4bytes)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ movdqu (%eax), %xmm1
+#endif
+ pxor %xmm2, %xmm1
+ pxor %xmm0, %xmm0
+ ptest %xmm1, %xmm0
+ jnc L(less16bytes)
+ pcmpeqb %xmm0, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, REM
+ jbe L(eq)
+#endif
+ add $16, %edx
+ add $16, %eax
+L(first4bytes):
+ movzbl (%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl (%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, (%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ je L(eq)
+#endif
+
+ movzbl 1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 1(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ je L(eq)
+#endif
+ movzbl 2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 2(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ je L(eq)
+#endif
+ movzbl 3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 3(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ je L(eq)
+#endif
+ movzbl 4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 4(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ je L(eq)
+#endif
+ movzbl 5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 5(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ je L(eq)
+#endif
+ movzbl 6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 6(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ je L(eq)
+#endif
+ movzbl 7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 7(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $8, REM
+ je L(eq)
+#endif
+ add $8, %eax
+ add $8, %edx
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ PUSH (%edi)
+#endif
+ PUSH (%esi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_remember_state
+#endif
+ mov %edx, %edi
+ mov %eax, %esi
+ xorl %eax, %eax
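+	/* Choose a loop offset %edx so that the 16-byte unaligned loads
+	   in L(loop) never cross a page boundary for either string;
+	   once a load would cross, the remaining bytes are compared one
+	   at a time in L(crosspage) instead.  */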
+L(check_offset):
+ movl %edi, %edx
+ movl %esi, %ecx
+ andl $0xfff, %edx
+ andl $0xfff, %ecx
+ cmpl %edx, %ecx
+ cmovl %edx, %ecx
+ lea -0xff0(%ecx), %edx
+ sub %edx, %edi
+ sub %edx, %esi
+ testl %edx, %edx
+ jg L(crosspage)
+L(loop):
+ movdqu (%esi,%edx), %xmm2
+ movdqu (%edi,%edx), %xmm1
+ TOLOWER (%xmm2, %xmm1)
+ pcmpistri $0x1a, %xmm2, %xmm1
+ jbe L(end)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, REM
+ jbe L(more16byteseq)
+#endif
+
+ add $16, %edx
+ jle L(loop)
+L(crosspage):
+ movzbl (%edi,%edx), %eax
+ movzbl (%esi,%edx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+ subl %ecx, %eax
+ jne L(ret)
+ testl %ecx, %ecx
+ je L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $1, REM
+ jbe L(more16byteseq)
+#endif
+ inc %edx
+ cmp $15, %edx
+ jle L(crosspage)
+ add %edx, %edi
+ add %edx, %esi
+ jmp L(check_offset)
+
+ .p2align 4
+L(end):
+ jnc L(ret)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %ecx, REM
+ jbe L(more16byteseq)
+#endif
+ lea (%ecx,%edx), %ecx
+ movzbl (%edi,%ecx), %eax
+ movzbl (%esi,%ecx), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+# endif
+#endif
+ subl %ecx, %eax
+L(ret):
+ POP (%esi)
+ POP (%edi)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ ret
+
+ .p2align 4
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_restore_state
+L(more16byteseq):
+ POP (%esi)
+# ifdef USE_AS_STRNCMP
+ POP (%edi)
+# endif
+#endif
+L(eq):
+ xorl %eax, %eax
+ RETURN
+
+L(neq):
+ mov $1, %eax
+ ja L(neq_bigger)
+ neg %eax
+L(neq_bigger):
+ RETURN
+
+L(less16bytes):
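+	/* Word-at-a-time NUL detection on the four bytes in %ecx:
+	   adding 0xfefefeff is subtracting 0x01010101, which makes any
+	   zero byte borrow; the xor/or/add sequence below isolates
+	   that condition.  */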
+ add $0xfefefeff, %ecx
+ jnc L(less4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movd %xmm3, %edi
+ xor %edi, %ecx
+#else
+ xor (%edx), %ecx
+#endif
+ or $0xfefefeff, %ecx
+ add $1, %ecx
+ jnz L(less4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(eq)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ psrldq $4, %xmm3
+ psrldq $4, %xmm4
+ movd %xmm3, %ecx
+ movd %xmm4, %edi
+ cmp %edi, %ecx
+ mov %ecx, %edi
+#else
+ mov 4(%edx), %ecx
+ cmp 4(%eax), %ecx
+#endif
+ jne L(more4bytes)
+ add $0xfefefeff, %ecx
+ jnc L(more4bytes)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ xor %edi, %ecx
+#else
+ xor 4(%edx), %ecx
+#endif
+ or $0xfefefeff, %ecx
+ add $1, %ecx
+ jnz L(more4bytes)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $8, REM
+ jbe L(eq)
+#endif
+
+ add $8, %edx
+ add $8, %eax
+L(less4bytes):
+
+ movzbl (%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl (%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, (%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ je L(eq)
+#endif
+ movzbl 1(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 1(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ je L(eq)
+#endif
+
+ movzbl 2(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 2(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ je L(eq)
+#endif
+ movzbl 3(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 3(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+L(more4bytes):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ je L(eq)
+#endif
+ movzbl 4(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 4(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ je L(eq)
+#endif
+ movzbl 5(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 5(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ je L(eq)
+#endif
+ movzbl 6(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 6(%edx)
+#endif
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ je L(eq)
+#endif
+ movzbl 7(%eax), %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %edi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi
+# endif
+ cmpl %ecx, %edi
+#else
+ cmpb %cl, 7(%edx)
+#endif
+ jne L(neq)
+ jmp L(eq)
+
+END (STRCMP)
+
+#endif
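
The repeated 0xfefefeff sequences above are a carry-based variant of
the classic word-at-a-time zero-byte test.  Its usual C formulation,
for reference (equivalent in effect, though the assembly tracks the
borrows through the flags instead):

    #include <stdint.h>

    /* Nonzero iff some byte of W is zero: subtracting 1 from every
       byte makes a zero byte wrap and set its top bit, and the ~w
       factor discards bytes whose top bit was set to begin with.  */
    static inline int
    has_zero_byte (uint32_t w)
    {
      return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
    }

For example, has_zero_byte (0x41004242) is nonzero because the
second-highest byte is zero.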
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..b25cc3e068
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -0,0 +1,2810 @@
+/* strcmp with SSSE3
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+# define STRCMP __strncmp_ssse3
+# endif
+# define STR1 8
+# define STR2 STR1+4
+# define CNT STR2+4
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define UPDATE_STRNCMP_COUNTER \
+	/* Calculate the number of bytes left to compare.  */ \
+ mov $16, %esi; \
+ sub %ecx, %esi; \
+ cmp %esi, REM; \
+ jbe L(more8byteseq); \
+ sub %esi, REM
+# define FLAGS %ebx
+# define REM %ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strcasecmp_l_ssse3
+# endif
+# ifdef PIC
+# define STR1 8
+# else
+# define STR1 4
+# endif
+# define STR2 STR1+4
+# define LOCALE 12 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx)
+# else
+# define RETURN ret; .p2align 4
+# endif
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS (%esp)
+# define NONASCII __strcasecmp_nonascii
+#elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strncasecmp_l_ssse3
+# endif
+# ifdef PIC
+# define STR1 12
+# else
+# define STR1 8
+# endif
+# define STR2 STR1+4
+# define CNT STR2+4
+# define LOCALE 16 /* Loaded before the adjustment. */
+# ifdef PIC
+# define RETURN POP (REM); POP (%ebx); ret; \
+ .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM)
+# else
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# endif
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate left number to compare */ \
+ mov $16, %esi; \
+ sub %ecx, %esi; \
+ cmp %esi, REM; \
+ jbe L(more8byteseq); \
+ sub %esi, REM
+# define FLAGS (%esp)
+# define REM %ebp
+# define NONASCII __strncasecmp_nonascii
+#else
+# ifndef STRCMP
+# define STRCMP __strcmp_ssse3
+# endif
+# define STR1 4
+# define STR2 STR1+4
+# define RETURN ret; .p2align 4
+# define UPDATE_STRNCMP_COUNTER
+# define FLAGS %ebx
+#endif
+
+ .section .text.ssse3,"ax",@progbits
+
+#ifdef USE_AS_STRCASECMP_L
+ENTRY (__strcasecmp_ssse3)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strcasecmp_nonascii
+# else
+ jne __strcasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strcasecmp_ssse3)
+#endif
+
+#ifdef USE_AS_STRNCASECMP_L
+ENTRY (__strncasecmp_ssse3)
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+ movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ addl %gs:0, %eax
+ movl (%eax), %eax
+# else
+ movl %gs:(%eax), %eax
+# endif
+# else
+# ifdef NO_TLS_DIRECT_SEG_REFS
+ movl %gs:0, %eax
+ movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax
+# else
+ movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax
+# endif
+# endif
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+# ifdef PIC
+ je L(ascii)
+ POP (%ebx)
+ jmp __strncasecmp_nonascii
+# else
+ jne __strncasecmp_nonascii
+ jmp L(ascii)
+# endif
+END (__strncasecmp_ssse3)
+#endif
+
+ENTRY (STRCMP)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movl LOCALE(%esp), %eax
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax
+# else
+ movl (%eax), %eax
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax)
+ jne NONASCII
+
+# ifdef PIC
+ PUSH (%ebx)
+ LOAD_PIC_REG(bx)
+# endif
+L(ascii):
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Lbelowupper:
+ .quad 0x4040404040404040
+ .quad 0x4040404040404040
+.Ltopupper:
+ .quad 0x5b5b5b5b5b5b5b5b
+ .quad 0x5b5b5b5b5b5b5b5b
+.Ltouppermask:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+# ifdef PIC
+# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx)
+# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx)
+# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx)
+# else
+# define UCLOW_reg .Lbelowupper
+# define UCHIGH_reg .Ltopupper
+# define LCQWORD_reg .Ltouppermask
+# endif
+#endif
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ PUSH (REM)
+#endif
+
+ movl STR1(%esp), %edx
+ movl STR2(%esp), %eax
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ movl CNT(%esp), REM
+ cmp $16, REM
+ jb L(less16bytes_sncmp)
+#elif !defined USE_AS_STRCASECMP_L
+ movzbl (%eax), %ecx
+ cmpb %cl, (%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 1(%eax), %ecx
+ cmpb %cl, 1(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 2(%eax), %ecx
+ cmpb %cl, 2(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 3(%eax), %ecx
+ cmpb %cl, 3(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 4(%eax), %ecx
+ cmpb %cl, 4(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 5(%eax), %ecx
+ cmpb %cl, 5(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 6(%eax), %ecx
+ cmpb %cl, 6(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ movzbl 7(%eax), %ecx
+ cmpb %cl, 7(%edx)
+ jne L(neq)
+ cmpl $0, %ecx
+ je L(eq)
+
+ add $8, %edx
+ add $8, %eax
+#endif
+ movl %edx, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(crosspage)
+ mov %eax, %ecx
+ and $0xfff, %ecx
+ cmp $0xff0, %ecx
+ ja L(crosspage)
+ pxor %xmm0, %xmm0
+ movlpd (%eax), %xmm1
+ movlpd (%edx), %xmm2
+ movhpd 8(%eax), %xmm1
+ movhpd 8(%edx), %xmm2
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa reg1, %xmm5; \
+ movdqa reg2, %xmm7; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb UCLOW_reg, %xmm5; \
+ pcmpgtb UCLOW_reg, %xmm7; \
+ pcmpgtb reg1, %xmm6; \
+ pand %xmm6, %xmm5; \
+ movdqa UCHIGH_reg, %xmm6; \
+ pcmpgtb reg2, %xmm6; \
+ pand %xmm6, %xmm7; \
+ pand LCQWORD_reg, %xmm5; \
+ por %xmm5, reg1; \
+ pand LCQWORD_reg, %xmm7; \
+ por %xmm7, reg2
+ TOLOWER (%xmm1, %xmm2)
+#else
+# define TOLOWER(reg1, reg2)
+#endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %ecx
+ sub $0xffff, %ecx
+ jnz L(less16bytes)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(eq)
+#endif
+ add $16, %eax
+ add $16, %edx
+
+L(crosspage):
+
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ PUSH (FLAGS)
+#endif
+ PUSH (%edi)
+ PUSH (%esi)
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cfi_remember_state
+#endif
+
+ movl %edx, %edi
+ movl %eax, %ecx
+ and $0xf, %ecx
+ and $0xf, %edi
+ xor %ecx, %eax
+ xor %edi, %edx
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ xor FLAGS, FLAGS
+#endif
+ cmp %edi, %ecx
+ je L(ashr_0)
+ ja L(bigger)
+ orl $0x20, FLAGS
+ xchg %edx, %eax
+ xchg %ecx, %edi
+L(bigger):
+ lea 15(%edi), %edi
+ sub %ecx, %edi
+ cmp $8, %edi
+ jle L(ashr_less_8)
+ cmp $14, %edi
+ je L(ashr_15)
+ cmp $13, %edi
+ je L(ashr_14)
+ cmp $12, %edi
+ je L(ashr_13)
+ cmp $11, %edi
+ je L(ashr_12)
+ cmp $10, %edi
+ je L(ashr_11)
+ cmp $9, %edi
+ je L(ashr_10)
+L(ashr_less_8):
+ je L(ashr_9)
+ cmp $7, %edi
+ je L(ashr_8)
+ cmp $6, %edi
+ je L(ashr_7)
+ cmp $5, %edi
+ je L(ashr_6)
+ cmp $4, %edi
+ je L(ashr_5)
+ cmp $3, %edi
+ je L(ashr_4)
+ cmp $2, %edi
+ je L(ashr_3)
+ cmp $1, %edi
+ je L(ashr_2)
+ cmp $0, %edi
+ je L(ashr_1)
+
+/*
+ * The following cases will be handled by ashr_0:
+ *   ecx (offset of esi)   eax (offset of edi)   relative offset       case
+ *   n (0~15)              n (0~15)              15 (15 + n - n)       ashr_0
+ */
+ .p2align 4
+L(ashr_0):
+ mov $0xffff, %esi
+ movdqa (%eax), %xmm1
+ pxor %xmm0, %xmm0
+ pcmpeqb %xmm1, %xmm0
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movdqa (%edx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1
+#else
+ pcmpeqb (%edx), %xmm1
+#endif
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ mov %ecx, %edi
+ jne L(less32bytes)
+ UPDATE_STRNCMP_COUNTER
+ movl $0x10, FLAGS
+ mov $0x10, %ecx
+ pxor %xmm0, %xmm0
+ .p2align 4
+L(loop_ashr_0):
+ movdqa (%eax, %ecx), %xmm1
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ movdqa (%edx, %ecx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+#else
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb (%edx, %ecx), %xmm1
+#endif
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ jmp L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1:
+ *   ecx (offset of esi)   eax (offset of edi)   relative offset       case
+ *   n (15)                n - 15                0 (15 + (n-15) - n)   ashr_1
+ */
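
This and the following L(ashr_N) cases realize the shift described
above with palignr: both strings are read with aligned 16-byte loads,
and the loads from the more misaligned string are shifted back into
phase through the previously loaded block.  An intrinsics sketch of
one such step, hard-coding the 1-byte shift of L(ashr_1):

    #include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 is palignr.  */

    /* PREV and CUR are consecutive aligned 16-byte blocks of the
       shifted string; their 32-byte concatenation shifted right by
       one byte gives the 16 bytes that line up with the other
       string, exactly like "palignr $1, %xmm3, %xmm2" above.  */
    static __m128i
    realign_by_1 (__m128i prev, __m128i cur)
    {
      return _mm_alignr_epi8 (cur, prev, 1);
    }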
+ .p2align 4
+L(ashr_1):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $15, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -15(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $1, FLAGS
+ lea 1(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_1):
+ add $16, %edi
+ jg L(nibble_ashr_1)
+
+L(gobble_ashr_1):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $1, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_1)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $1, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_1)
+
+ .p2align 4
+L(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfffe, %esi
+ jnz L(ashr_1_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $15, REM
+ jbe L(ashr_1_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_1)
+
+ .p2align 4
+L(ashr_1_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2:
+ *   ecx (offset of esi)   eax (offset of edi)   relative offset       case
+ *   n (14~15)             n - 14                1 (15 + (n-14) - n)   ashr_2
+ */
+ .p2align 4
+L(ashr_2):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -14(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $2, FLAGS
+ lea 2(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_2):
+ add $16, %edi
+ jg L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_2)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_2)
+
+ .p2align 4
+L(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfffc, %esi
+ jnz L(ashr_2_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $14, REM
+ jbe L(ashr_2_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_2)
+
+ .p2align 4
+L(ashr_2_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3:
+ *   ecx (offset of esi)   eax (offset of edi)   relative offset       case
+ *   n (13~15)             n - 13                2 (15 + (n-13) - n)   ashr_3
+ */
+ .p2align 4
+L(ashr_3):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -13(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $3, FLAGS
+ lea 3(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_3):
+ add $16, %edi
+ jg L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_3)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_3)
+
+ .p2align 4
+L(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfff8, %esi
+ jnz L(ashr_3_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $13, REM
+ jbe L(ashr_3_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_3)
+
+ .p2align 4
+L(ashr_3_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+L(ashr_4):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -12(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $4, FLAGS
+ lea 4(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_4):
+ add $16, %edi
+ jg L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_4)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_4)
+
+ .p2align 4
+L(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfff0, %esi
+ jnz L(ashr_4_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $12, REM
+ jbe L(ashr_4_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_4)
+
+ .p2align 4
+L(ashr_4_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(11~15) n -11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+L(ashr_5):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -11(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $5, FLAGS
+ lea 5(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_5):
+ add $16, %edi
+ jg L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_5)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_5)
+
+ .p2align 4
+L(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xffe0, %esi
+ jnz L(ashr_5_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $11, REM
+ jbe L(ashr_5_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_5)
+
+ .p2align 4
+L(ashr_5_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(10~15) n -10 5(15 +(n-10) - n) ashr_6
+ */
+
+ .p2align 4
+L(ashr_6):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -10(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $6, FLAGS
+ lea 6(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_6):
+ add $16, %edi
+ jg L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_6)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_6)
+
+ .p2align 4
+L(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xffc0, %esi
+ jnz L(ashr_6_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $10, REM
+ jbe L(ashr_6_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_6)
+
+ .p2align 4
+L(ashr_6_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7
+ */
+
+ .p2align 4
+L(ashr_7):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -9(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $7, FLAGS
+	lea	7(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_7):
+ add $16, %edi
+ jg L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_7)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_7)
+
+ .p2align 4
+L(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xff80, %esi
+ jnz L(ashr_7_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $9, REM
+ jbe L(ashr_7_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_7)
+
+ .p2align 4
+L(ashr_7_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $7, %xmm0
+ psrldq $7, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8
+ */
+ .p2align 4
+L(ashr_8):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -8(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $8, FLAGS
+ lea 8(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_8):
+ add $16, %edi
+ jg L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_8)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_8)
+
+ .p2align 4
+L(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xff00, %esi
+ jnz L(ashr_8_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $8, REM
+ jbe L(ashr_8_exittail)
+#endif
+	pxor	%xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_8)
+
+ .p2align 4
+L(ashr_8_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $8, %xmm0
+ psrldq $8, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9
+ */
+ .p2align 4
+L(ashr_9):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -7(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $9, FLAGS
+ lea 9(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_9):
+ add $16, %edi
+ jg L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_9)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_9)
+
+ .p2align 4
+L(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfe00, %esi
+ jnz L(ashr_9_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(ashr_9_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_9)
+
+ .p2align 4
+L(ashr_9_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $9, %xmm0
+ psrldq $9, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10
+ */
+ .p2align 4
+L(ashr_10):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -6(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $10, FLAGS
+ lea 10(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_10):
+ add $16, %edi
+ jg L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_10)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_10)
+
+ .p2align 4
+L(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xfc00, %esi
+ jnz L(ashr_10_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ jbe L(ashr_10_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_10)
+
+ .p2align 4
+L(ashr_10_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $10, %xmm0
+ psrldq $10, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11
+ */
+ .p2align 4
+L(ashr_11):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -5(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $11, FLAGS
+ lea 11(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_11):
+ add $16, %edi
+ jg L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_11)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_11)
+
+ .p2align 4
+L(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xf800, %esi
+ jnz L(ashr_11_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ jbe L(ashr_11_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_11)
+
+ .p2align 4
+L(ashr_11_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $11, %xmm0
+ psrldq $11, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12
+ */
+ .p2align 4
+L(ashr_12):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -4(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $12, FLAGS
+ lea 12(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_12):
+ add $16, %edi
+ jg L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_12)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_12)
+
+ .p2align 4
+L(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xf000, %esi
+ jnz L(ashr_12_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(ashr_12_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_12)
+
+ .p2align 4
+L(ashr_12_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $12, %xmm0
+ psrldq $12, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13
+ */
+ .p2align 4
+L(ashr_13):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -3(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $13, FLAGS
+ lea 13(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_13):
+ add $16, %edi
+ jg L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_13)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_13)
+
+ .p2align 4
+L(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xe000, %esi
+ jnz L(ashr_13_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ jbe L(ashr_13_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_13)
+
+ .p2align 4
+L(ashr_13_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $13, %xmm0
+ psrldq $13, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14
+ */
+ .p2align 4
+L(ashr_14):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -2(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $14, FLAGS
+ lea 14(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_14):
+ add $16, %edi
+ jg L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_14)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_14)
+
+ .p2align 4
+L(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0xc000, %esi
+ jnz L(ashr_14_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ jbe L(ashr_14_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_14)
+
+ .p2align 4
+L(ashr_14_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * ecx(offset of esi) eax(offset of edi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15
+ */
+
+ .p2align 4
+L(ashr_15):
+ mov $0xffff, %esi
+ pxor %xmm0, %xmm0
+ movdqa (%edx), %xmm2
+ movdqa (%eax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %edi
+ shr %cl, %esi
+ shr %cl, %edi
+ sub %edi, %esi
+ lea -1(%ecx), %edi
+ jnz L(less32bytes)
+
+ UPDATE_STRNCMP_COUNTER
+
+ movdqa (%edx), %xmm3
+ pxor %xmm0, %xmm0
+ mov $16, %ecx
+ orl $15, FLAGS
+ lea 15(%edx), %edi
+ and $0xfff, %edi
+ sub $0x1000, %edi
+
+ .p2align 4
+L(loop_ashr_15):
+ add $16, %edi
+ jg L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+
+ add $16, %edi
+ jg L(nibble_ashr_15)
+
+ movdqa (%eax, %ecx), %xmm1
+ movdqa (%edx, %ecx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ sub $0xffff, %esi
+ jnz L(exit)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $16, REM
+ lea -16(REM), REM
+ jbe L(more8byteseq)
+#endif
+ add $16, %ecx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_15)
+
+ .p2align 4
+L(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %esi
+ test $0x8000, %esi
+ jnz L(ashr_15_exittail)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ jbe L(ashr_15_exittail)
+#endif
+ pxor %xmm0, %xmm0
+ sub $0x1000, %edi
+ jmp L(gobble_ashr_15)
+
+ .p2align 4
+L(ashr_15_exittail):
+ movdqa (%eax, %ecx), %xmm1
+ psrldq $15, %xmm0
+ psrldq $15, %xmm3
+ jmp L(aftertail)
+
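+/*
+ * L(aftertail): the page-guard path found the terminator (or the
+ * strncmp count ran out) inside the already-loaded shifted chunk, so
+ * only the surviving bytes of %xmm3 are compared against the other
+ * string before falling into L(exit) with the inverted match mask in
+ * %esi.  L(exit)/L(less32bytes) then turn the loop position back into
+ * byte offsets: the low five bits of FLAGS hold the shift amount N, so
+ * %edx is advanced by (N + %ecx - 16) while %eax is advanced by %ecx,
+ * and bit 0x20 of FLAGS says whether the two strings were exchanged on
+ * entry and must be swapped back before computing the result.
+ */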
+ .p2align 4
+L(aftertail):
+ TOLOWER (%xmm1, %xmm3)
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ not %esi
+L(exit):
+ mov FLAGS, %edi
+ and $0x1f, %edi
+ lea -16(%edi, %ecx), %edi
+L(less32bytes):
+ add %edi, %edx
+ add %ecx, %eax
+ testl $0x20, FLAGS
+ jz L(ret2)
+ xchg %eax, %edx
+
+ .p2align 4
+L(ret2):
+ mov %esi, %ecx
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+#endif
+ POP (%esi)
+ POP (%edi)
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ POP (FLAGS)
+#endif
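+
+/*
+ * %ecx holds one bit per byte position of the first difference or NUL
+ * in a 16-byte chunk: the low byte covers offsets 0..7, the high byte
+ * (via L(2next_8_bytes)) offsets 8..15.  The bit tests below are an
+ * unrolled form of this sketch:
+ *
+ *   int i = __builtin_ctz (mask);
+ *   return (unsigned char) s1[i] - (unsigned char) s2[i];
+ *
+ * with the REM cut-offs returning 0 when the difference lies past the
+ * strncmp length limit.
+ */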
+L(less16bytes):
+ test %cl, %cl
+ jz L(2next_8_bytes)
+
+ test $0x01, %cl
+ jnz L(Byte0)
+
+ test $0x02, %cl
+ jnz L(Byte1)
+
+ test $0x04, %cl
+ jnz L(Byte2)
+
+ test $0x08, %cl
+ jnz L(Byte3)
+
+ test $0x10, %cl
+ jnz L(Byte4)
+
+ test $0x20, %cl
+ jnz L(Byte5)
+
+ test $0x40, %cl
+ jnz L(Byte6)
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(eq)
+#endif
+
+ movzx 7(%eax), %ecx
+ movzx 7(%edx), %eax
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
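+/*
+ * For the case-insensitive variants every Byte* case folds both bytes
+ * through the C-locale tolower table before subtracting; in effect
+ * (a sketch -- the table stores one int32_t per character):
+ *
+ *   const int32_t *tab = (const int32_t *) _nl_C_LC_CTYPE_tolower + 128;
+ *   return tab[(unsigned char) s1[i]] - tab[(unsigned char) s2[i]];
+ *
+ * The 128-entry bias is there because the table is also defined for
+ * indices -128..-1 (including EOF).
+ */
+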
+L(Byte0):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $0, REM
+ jbe L(eq)
+#endif
+ movzx (%eax), %ecx
+ movzx (%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte1):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $1, REM
+ jbe L(eq)
+#endif
+ movzx 1(%eax), %ecx
+ movzx 1(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte2):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, REM
+ jbe L(eq)
+#endif
+ movzx 2(%eax), %ecx
+ movzx 2(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte3):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, REM
+ jbe L(eq)
+#endif
+ movzx 3(%eax), %ecx
+ movzx 3(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte4):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, REM
+ jbe L(eq)
+#endif
+ movzx 4(%eax), %ecx
+ movzx 4(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte5):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, REM
+ jbe L(eq)
+#endif
+ movzx 5(%eax), %ecx
+ movzx 5(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(Byte6):
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, REM
+ jbe L(eq)
+#endif
+ movzx 6(%eax), %ecx
+ movzx 6(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+L(2next_8_bytes):
+ add $8, %eax
+ add $8, %edx
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $8, REM
+ lea -8(REM), REM
+ jbe L(eq)
+#endif
+
+ test $0x01, %ch
+ jnz L(Byte0)
+
+ test $0x02, %ch
+ jnz L(Byte1)
+
+ test $0x04, %ch
+ jnz L(Byte2)
+
+ test $0x08, %ch
+ jnz L(Byte3)
+
+ test $0x10, %ch
+ jnz L(Byte4)
+
+ test $0x20, %ch
+ jnz L(Byte5)
+
+ test $0x40, %ch
+ jnz L(Byte6)
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, REM
+ jbe L(eq)
+#endif
+ movzx 7(%eax), %ecx
+ movzx 7(%edx), %eax
+
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax
+# endif
+#endif
+
+ sub %ecx, %eax
+ RETURN
+
+#ifdef USE_AS_STRNCMP
+L(neq_sncmp):
+#endif
+L(neq):
+ mov $1, %eax
+ ja L(neq_bigger)
+ neg %eax
+L(neq_bigger):
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+#endif
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ .p2align 4
+ cfi_restore_state
+L(more8byteseq):
+
+# ifdef USE_AS_STRNCASECMP_L
+ addl $4, %esp
+ cfi_adjust_cfa_offset (-4)
+# endif
+ POP (%esi)
+ POP (%edi)
+# ifdef USE_AS_STRNCMP
+ POP (FLAGS)
+# endif
+#endif
+
+#ifdef USE_AS_STRNCMP
+L(eq_sncmp):
+#endif
+L(eq):
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ POP (REM)
+#endif
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# ifdef PIC
+ POP (%ebx)
+# endif
+#endif
+ xorl %eax, %eax
+ ret
+
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ .p2align 4
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+ CFI_PUSH (%ebx)
+# endif
+ CFI_PUSH (REM)
+L(less16bytes_sncmp):
+# ifdef USE_AS_STRNCASECMP_L
+ PUSH (%esi)
+# endif
+ test REM, REM
+ jz L(eq_sncmp)
+
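+/* Fully unrolled, bounded byte-at-a-time compare of up to 16 bytes.
+   Each block below is one iteration of this sketch:
+
+     for (i = 0; i < 16; i++)
+       {
+         if (s1[i] != s2[i])	// folded through tolower for strncasecmp
+           goto neq_sncmp;
+         if (s1[i] == 0 || n == i + 1)
+           goto eq_sncmp;
+       }
+ */
+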
+ movzbl (%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl (%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, (%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $1, REM
+ je L(eq_sncmp)
+
+ movzbl 1(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 1(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 1(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $2, REM
+ je L(eq_sncmp)
+
+ movzbl 2(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 2(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 2(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $3, REM
+ je L(eq_sncmp)
+
+ movzbl 3(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 3(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 3(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $4, REM
+ je L(eq_sncmp)
+
+ movzbl 4(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 4(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 4(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $5, REM
+ je L(eq_sncmp)
+
+ movzbl 5(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 5(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 5(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $6, REM
+ je L(eq_sncmp)
+
+ movzbl 6(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 6(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 6(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $7, REM
+ je L(eq_sncmp)
+
+ movzbl 7(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 7(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 7(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $8, REM
+ je L(eq_sncmp)
+
+ movzbl 8(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 8(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 8(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $9, REM
+ je L(eq_sncmp)
+
+ movzbl 9(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 9(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 9(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $10, REM
+ je L(eq_sncmp)
+
+ movzbl 10(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 10(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 10(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $11, REM
+ je L(eq_sncmp)
+
+ movzbl 11(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 11(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 11(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $12, REM
+ je L(eq_sncmp)
+
+ movzbl 12(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 12(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 12(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $13, REM
+ je L(eq_sncmp)
+
+ movzbl 13(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 13(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 13(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $14, REM
+ je L(eq_sncmp)
+
+ movzbl 14(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 14(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 14(%edx)
+# endif
+ jne L(neq_sncmp)
+ test %cl, %cl
+ je L(eq_sncmp)
+
+ cmp $15, REM
+ je L(eq_sncmp)
+
+ movzbl 15(%eax), %ecx
+# ifdef USE_AS_STRNCASECMP_L
+ movzbl 15(%edx), %esi
+# ifdef PIC
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi
+# else
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx
+ movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi
+# endif
+ cmpl %ecx, %esi
+# else
+ cmpb %cl, 15(%edx)
+# endif
+ jne L(neq_sncmp)
+
+# ifdef USE_AS_STRNCASECMP_L
+L(eq_sncmp):
+ POP (%esi)
+# endif
+ POP (REM)
+# if defined USE_AS_STRNCASECMP_L && defined PIC
+ POP (%ebx)
+# endif
+ xor %eax, %eax
+ ret
+
+# ifdef USE_AS_STRNCASECMP_L
+ .p2align 4
+# ifdef PIC
+ CFI_PUSH (%ebx)
+# endif
+ CFI_PUSH (REM)
+ CFI_PUSH (%esi)
+L(neq_sncmp):
+ mov $1, %eax
+ mov $-1, %edx
+ cmovna %edx, %eax
+ POP (%esi)
+ POP (REM)
+# ifdef PIC
+ POP (%ebx)
+# endif
+ ret
+# endif
+#endif
+
+END (STRCMP)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
new file mode 100644
index 0000000000..56de25a4b7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRNCMP
+# define STRCMP strncmp
+# define __GI_STRCMP __GI_strncmp
+# define __STRCMP_IA32 __strncmp_ia32
+# define __STRCMP_SSSE3 __strncmp_ssse3
+# define __STRCMP_SSE4_2 __strncmp_sse4_2
+#elif defined USE_AS_STRCASECMP_L
+# define STRCMP __strcasecmp_l
+# define __GI_STRCMP __GI_strcasecmp_l
+# define __STRCMP_IA32 __strcasecmp_l_ia32
+# define __STRCMP_SSSE3 __strcasecmp_l_ssse3
+# define __STRCMP_SSE4_2 __strcasecmp_l_sse4_2
+#elif defined USE_AS_STRNCASECMP_L
+# define STRCMP __strncasecmp_l
+# define __GI_STRCMP __GI_strncasecmp_l
+# define __STRCMP_IA32 __strncasecmp_l_ia32
+# define __STRCMP_SSSE3 __strncasecmp_l_ssse3
+# define __STRCMP_SSE4_2 __strncasecmp_l_sse4_2
+#else
+# define STRCMP strcmp
+# define __GI_STRCMP __GI_strcmp
+# define __STRCMP_IA32 __strcmp_ia32
+# define __STRCMP_SSSE3 __strcmp_ssse3
+# define __STRCMP_SSE4_2 __strcmp_sse4_2
+#endif
+
+/* Define multiple versions only for the definition in libc. Don't
+   define multiple versions for strncmp in the static library, since
+   strncmp is needed before initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+ .text
+ENTRY(STRCMP)
+ .type STRCMP, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
+2: ret
+END(STRCMP)
+
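+/* A rough C model of the selection above (the real code uses the
+   HAS_CPU_FEATURE/HAS_ARCH_FEATURE macros from <init-arch.h>; the
+   typedef is illustrative only):
+
+     typedef int fn (const char *, const char *);
+     static fn *
+     resolver (void)
+     {
+       if (!HAS_CPU_FEATURE (SSSE3))
+         return __STRCMP_IA32;
+       if (!HAS_CPU_FEATURE (SSE4_2) || HAS_ARCH_FEATURE (Slow_SSE4_2))
+         return __STRCMP_SSSE3;
+       return __STRCMP_SSE4_2;
+     }
+
+   The resolver runs once, at symbol resolution time, and the chosen
+   implementation is called directly afterwards.  */
+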
+# undef ENTRY
+# define ENTRY(name) \
+ .type __STRCMP_IA32, @function; \
+ .p2align 4; \
+ .globl __STRCMP_IA32; \
+ .hidden __STRCMP_IA32; \
+ __STRCMP_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with hidden functions in a shared library: they
+   would be called without setting up EBX, which is needed for the PLT
+   and therefore by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32
+# endif
+#endif
+
+#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
+ && !defined USE_AS_STRNCASECMP_L
+# include "../strcmp.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..ed627a5f62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2250 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \
+ CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+   INDEX is a register that contains the index into the jump table.
+ INDEX is a register contains the index into the jump table.
+ SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ SETUP_PIC_REG(cx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+   absolute offsets.  INDEX is a register that contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
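+/* In C terms, BRANCH_TO_JMPTBL_ENTRY is roughly a computed goto on the
+   remaining length (a sketch):
+
+     goto *((char *) table + table[index]);   // SHARED: relative entries
+     goto *(void *) table[index];             // !SHARED: absolute entries
+ */
+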
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edi
+ mov STR2(%esp), %esi
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitZero)
+
+ mov %esi, %ecx
+# ifndef USE_AS_STPCPY
+ mov %edi, %eax /* save result */
+# endif
+ and $15, %ecx
+ jz L(SourceStringAlignmentZero)
+
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%esi), %xmm1
+ add %ecx, %ebx
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%edi)
+
+ sub %ecx, %edi
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movdqu %xmm3, (%edi, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %edi
+ lea 128(%ebx, %edx), %ebx
+
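+/* The 64-byte loop detects a NUL anywhere in four 16-byte chunks with
+   three pminub's and a single compare: the unsigned byte-wise minimum
+   of all four chunks has a zero byte iff one of them does.  As a
+   sketch, using intrinsics:
+
+     __m128i m = _mm_min_epu8 (_mm_min_epu8 (c0, c1),
+                               _mm_min_epu8 (c2, c3));
+     if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ())))
+       break;	// some chunk contains the terminator
+ */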
+L(Unaligned64Loop):
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+ add $64, %edi
+ add $64, %esi
+ movdqu %xmm4, -64(%edi)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%edi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+ movdqu %xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax
+# endif
+ movdqu %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx
+ lea 49(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+ pxor %xmm0, %xmm0
+ movdqa (%esi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb 16(%esi), %xmm0
+ movdqu %xmm1, (%edi)
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ jmp L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %edi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ sub %ecx, %ebx
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+# endif
+ movdqu %xmm4, (%edi)
+ add $63, %ebx
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax
+# endif
+ movdqu %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx
+ lea 17(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax
+# endif
+ movdqu %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx
+ lea 33(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %edi
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+ mov %edi, %eax
+# endif
+ RETURN
+
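+/* Each L(ExitN) below copies the final N bytes of the string,
+   terminator included, using the widest loads that fit (some odd sizes
+   store the terminating NUL directly from %dh, which is known to be
+   zero here, instead of reloading it), then zero-fills whatever remains
+   of the strncpy buffer.  The net effect, as a sketch:
+
+     memcpy (dst, src, N);
+     if (n > N)
+       memset (dst + N, 0, n - N);	// L(StrncpyFillTailWithZero)
+     return dst;			// dst + N - 1 for stpcpy
+ */
+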
+ .p2align 4
+L(Exit1):
+ movb %dh, (%edi)
+# ifdef USE_AS_STPCPY
+ lea (%edi), %eax
+# endif
+ sub $1, %ebx
+ lea 1(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ sub $2, %ebx
+ lea 2(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%edi)
+ movb %dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ sub $3, %ebx
+ lea 3(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ sub $4, %ebx
+ lea 4(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%esi), %ecx
+ movb %dh, 4(%edi)
+ movl %ecx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ sub $5, %ebx
+ lea 5(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ sub $6, %ebx
+ lea 6(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ sub $7, %ebx
+ lea 7(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ sub $8, %ebx
+ lea 8(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movlpd (%esi), %xmm0
+ movb %dh, 8(%edi)
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ sub $9, %ebx
+ lea 9(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ sub $10, %ebx
+ lea 10(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ sub $11, %ebx
+ lea 11(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ sub $12, %ebx
+ lea 12(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ sub $13, %ebx
+ lea 13(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ sub $14, %ebx
+ lea 14(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ sub $15, %ebx
+ lea 15(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ sub $16, %ebx
+ lea 16(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+ movb %dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ sub $17, %ebx
+ lea 17(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ sub $18, %ebx
+ lea 18(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ sub $19, %ebx
+ lea 19(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ sub $20, %ebx
+ lea 20(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ sub $21, %ebx
+ lea 21(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ sub $22, %ebx
+ lea 22(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ sub $23, %ebx
+ lea 23(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ sub $24, %ebx
+ lea 24(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ sub $25, %ebx
+ lea 25(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ sub $26, %ebx
+ lea 26(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ sub $27, %ebx
+ lea 27(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ sub $28, %ebx
+ lea 28(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ sub $29, %ebx
+ lea 29(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ sub $30, %ebx
+ lea 30(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ sub $31, %ebx
+ lea 31(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ sub $32, %ebx
+ lea 32(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(StrncpyExit1):
+ movb (%esi), %dl
+ movb %dl, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit3):
+ movw (%esi), %cx
+ movb 2(%esi), %dl
+ movw %cx, (%edi)
+ movb %dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ movl (%esi), %ecx
+ movb 4(%esi), %dl
+ movl %ecx, (%edi)
+ movb %dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ movlpd (%esi), %xmm0
+ movb 8(%esi), %dl
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%esi), %xmm0
+ movb 16(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movb %cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movb 20(%esi), %dl
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movb 24(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movb 32(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+ movb %cl, 32(%edi)
+ RETURN
+
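+/* L(FillN): store exactly N zero bytes.  %edx and %xmm0 are zero on
+   entry; the entries that store at offset -1 (Fill3, Fill7, Fill15)
+   overlap the NUL terminator written just before %edi, so one wider
+   store can cover an odd length.  */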
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movl %edx, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%edi)
+ movb %dl, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%edi)
+ movw %dx, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movlpd %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 5(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 6(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%edi, %ecx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %edx, %edx
+ add $15, %ebx
+ add %ecx, %edi
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+# endif
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%edi)
+ add $16, %edi
+
+ mov %edi, %esi
+ and $0xf, %esi
+ sub %esi, %edi
+ add %esi, %ebx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ movdqa %xmm0, 32(%edi)
+ movdqa %xmm0, 48(%edi)
+ add $64, %edi
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ add $32, %edi
+ sub $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+ add $16, %ebx
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
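
For strncpy the destination must be padded with NUL bytes out to exactly
n bytes.  L(StrncpyFillTailWithZero) above does that with one unaligned
16-byte zero store, then rounds %edi down to a 16-byte boundary
(harmlessly re-zeroing the overlap) so the bulk of the padding can use
aligned movdqa stores, with L(FillTable) finishing the last 0..16 bytes.
A rough C sketch of the same strategy, under the caveat that
fill_tail_with_zero is an illustrative name, not glibc's:

    #include <string.h>
    #include <stdint.h>

    /* Sketch of the padding scheme in L(StrncpyFillTailWithZero).  */
    static void fill_tail_with_zero (unsigned char *p, size_t n)
    {
      if (n <= 16)
        {
          memset (p, 0, n);        /* L(FillTable) handles this directly */
          return;
        }
      memset (p, 0, 16);           /* one unaligned 16-byte head store */
      p += 16;
      n -= 16;
      size_t skew = (uintptr_t) p & 15;
      p -= skew;                   /* round down to 16-byte alignment */
      n += skew;                   /* the skew bytes get zeroed twice */
      memset (p, 0, n);            /* stands in for the movdqa loop */
    }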
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+ lea 64(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%edi)
+ lea 16(%edi, %ecx), %edi
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(ExitZero):
+ movl %edi, %eax
+ RETURN
+
+END (STRCPY)
+
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable):
+ .int JMPTBL(L(Exit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
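
Each entry in the three tables above is stored as an offset relative to
the table itself (the JMPTBL macro expands to "target - table"), so the
dispatch in BRANCH_TO_JMPTBL_ENTRY can load an entry, add the table's
address, and jump, with no absolute relocations; that keeps the dispatch
position-independent.  A minimal C sketch of the same relative-offset
dispatch, assuming GCC's computed-goto extension (the names are
illustrative):

    /* Entries hold "label - base", mirroring JMPTBL.  */
    static int dispatch_exit (unsigned int len)
    {
      static const int offsets[] = {
        &&exit1 - &&exit1,
        &&exit2 - &&exit1,
      };
      goto *(&&exit1 + offsets[len]);
     exit1:
      return 1;
     exit2:
      return 2;
    }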
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# define RETURN1 ret
+
+ .text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ PUSH (%ebx)
+
+ mov %edx, %edi
+ lea 16(%ecx), %ebx
+ and $-16, %ebx
+ pxor %xmm0, %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+ pcmpeqb (%ebx), %xmm0
+ pmovmskb %xmm0, %eax
+ sub %ecx, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %ecx, %eax
+ lea 16(%ecx), %ecx
+ and $-16, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+ xor %ebx, %ebx
+
+ .p2align 4
+ movdqa (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movdqu %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm4
+ movdqu %xmm3, (%edx, %ebx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm1
+ movdqu %xmm4, (%edx, %ebx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm2
+ movdqu %xmm1, (%edx, %ebx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm3, (%edx, %ebx)
+ mov %ecx, %eax
+ lea 16(%ecx, %ebx), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
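+/* Scan 64 bytes per iteration.  pminub keeps a zero byte wherever
+   either operand has one, so one pcmpeqb against the minimum of all
+   four 16-byte blocks detects a NUL anywhere in the 64 bytes; the
+   blocks are stored only after the check.  */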
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps 32(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ add $64, %ecx
+ pminub %xmm7, %xmm3
+ add $64, %edx
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+L(Aligned64Loop_start):
+ movdqu %xmm4, -64(%edx)
+ movaps (%ecx), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edx)
+ movaps 16(%ecx), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%ecx), %xmm3
+ movdqu %xmm6, -32(%edx)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edx)
+ movaps 48(%ecx), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $64, %edx
+ add $64, %ecx
+ test %eax, %eax
+ jz L(Aligned64Loop_start)
+L(Aligned64Leave):
+ sub $0xa0, %ebx
+ pxor %xmm0, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%ebx), %ebx
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%ebx), %ebx
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%ebx), %ebx
+
+/*-----------------End of main part---------------------------*/
+
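+/* %eax holds the pmovmskb bitmask of NUL positions within the last
+   16-byte block; the test chains below branch on its lowest set
+   bit, i.e. the byte offset of the terminator.  */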
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %ebx, %edx
+ add %ebx, %ecx
+
+ POP (%ebx)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ /* Exit 8 */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ /* Exit 16 */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edi, %eax
+# endif
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edx, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail8):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail9):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail10):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail11):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail12):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail16):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000000..effd85da94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3901 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 8
+# define ENTRANCE PUSH (%ebx)
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN ret
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifdef USE_AS_STPCPY
+# define SAVE_RESULT(n) lea n(%edx), %eax
+# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax
+# else
+# define SAVE_RESULT(n) movl %edi, %eax
+# define SAVE_RESULT_TAIL(n) movl %edx, %eax
+# endif
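+
+/* stpcpy must return a pointer to the copied NUL terminator, while
+   strcpy returns the original destination pointer; the SAVE_RESULT*
+   macros select between the two.  */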
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+/* In this code the following instructions are used for copying:
+	movb - 1 byte
+	movw - 2 bytes
+	movl - 4 bytes
+	movlpd - 8 bytes
+	movaps - 16 bytes - requires 16-byte alignment
+	of source and destination addresses.
+*/
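
The fixed-size exit paths throughout this file combine these moves so
that any length up to 16 needs at most two loads and two stores, with
the second pair overlapping the first; 7 bytes, for instance, are
copied as two overlapping 4-byte moves (see L(ExitTail7) below).  A
minimal C sketch of that trick, where copy7 is an illustrative name:

    #include <string.h>
    #include <stdint.h>

    /* Copy exactly 7 bytes as two overlapping 4-byte moves.  Both
       loads complete before either store, as in the assembly.  */
    static void copy7 (unsigned char *dst, const unsigned char *src)
    {
      uint32_t lo, hi;
      memcpy (&lo, src, 4);        /* bytes 0..3 */
      memcpy (&hi, src + 3, 4);    /* bytes 3..6, overlapping byte 3 */
      memcpy (dst, &lo, 4);
      memcpy (dst + 3, &hi, 4);
    }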
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ je L(ExitTail16)
+# endif
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+# endif
+ PUSH (%esi)
+# ifdef USE_AS_STRNCPY
+ mov %ecx, %esi
+ sub $16, %ebx
+ and $0xf, %esi
+
+/* Add the source's offset within its 16-byte block (ecx & 15)
+   back into the remaining count in %ebx.  */
+
+ add %esi, %ebx
+# endif
+ lea 16(%ecx), %esi
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ movlpd (%ecx), %xmm1
+ movlpd %xmm1, (%edx)
+
+ pcmpeqb (%esi), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+# ifdef USE_AS_STRNCPY
+ add %eax, %esi
+ lea -1(%esi), %esi
+ and $1<<31, %esi
+ test %esi, %esi
+ jnz L(ContinueCopy)
+ lea 16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+/* Case: source and destination have equal offsets within their
+   16-byte blocks, so after the adjustment above both are aligned.  */
+
+ jz L(Align16Both)
+
+ cmp $8, %eax
+ jae L(ShlHigh8)
+ cmp $1, %eax
+ je L(Shl1)
+ cmp $2, %eax
+ je L(Shl2)
+ cmp $3, %eax
+ je L(Shl3)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $5, %eax
+ je L(Shl5)
+ cmp $6, %eax
+ je L(Shl6)
+ jmp L(Shl7)
+
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %eax
+ je L(Shl9)
+ cmp $10, %eax
+ je L(Shl10)
+ cmp $11, %eax
+ je L(Shl11)
+ cmp $12, %eax
+ je L(Shl12)
+ cmp $13, %eax
+ je L(Shl13)
+ cmp $14, %eax
+ je L(Shl14)
+ jmp L(Shl15)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ lea 112(%ebx, %eax), %ebx
+# endif
+ mov $-0x40, %esi
+
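+/* Main 64-byte loop: fold the four source blocks with pminub and
+   test once for a NUL before storing anything, so the bulk loop
+   never writes past the terminator.  */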
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqb %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+ lea 48(%ebx), %ebx
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ jmp L(CopyFrom1To16Bytes)
+
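
When source and destination disagree in alignment modulo 16, the
L(ShlN) paths below read only 16-byte-aligned blocks from the source
and use SSSE3's palignr to splice each pair of adjacent blocks into
the 16 bytes destined for the next aligned destination slot.  A
minimal intrinsics sketch of a single N = 1 step; copy_shl1_block is
an illustrative name, and it assumes src - 1 is 16-byte aligned and
both loads are safe to issue, as the real code arranges:

    #include <tmmintrin.h>           /* SSSE3: _mm_alignr_epi8 */

    /* One L(Shl1)-style step: two aligned loads spliced by palignr
       feed one aligned 16-byte store.  */
    static void copy_shl1_block (unsigned char *dst16,
                                 const unsigned char *src)
    {
      __m128i prev = _mm_load_si128 ((const __m128i *) (src - 1));
      __m128i next = _mm_load_si128 ((const __m128i *) (src + 15));
      /* prev bytes 1..15 then next byte 0 == src[0..15].  */
      __m128i out = _mm_alignr_epi8 (next, prev, 1);
      _mm_store_si128 ((__m128i *) dst16, out);
    }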
+ .p2align 4
+L(Shl1):
+ movaps -1(%ecx), %xmm1
+ movaps 15(%ecx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 31(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -15(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+ movaps 15(%ecx), %xmm2
+ movaps 31(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $1, %xmm3, %xmm4
+ jnz L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ mov $15, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2):
+ movaps -2(%ecx), %xmm1
+ movaps 14(%ecx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 30(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -14(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+ movaps 14(%ecx), %xmm2
+ movaps 30(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $2, %xmm3, %xmm4
+ jnz L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3):
+ movaps -3(%ecx), %xmm1
+ movaps 13(%ecx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 29(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -13(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+ movaps 13(%ecx), %xmm2
+ movaps 29(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $3, %xmm3, %xmm4
+ jnz L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5):
+ movaps -5(%ecx), %xmm1
+ movaps 11(%ecx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 27(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -11(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+ movaps 11(%ecx), %xmm2
+ movaps 27(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $5, %xmm3, %xmm4
+ jnz L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6):
+ movaps -6(%ecx), %xmm1
+ movaps 10(%ecx), %xmm2
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 26(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -10(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+ movaps 10(%ecx), %xmm2
+ movaps 26(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $6, %xmm3, %xmm4
+ jnz L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7):
+ movaps -7(%ecx), %xmm1
+ movaps 9(%ecx), %xmm2
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 25(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -9(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+ movaps 9(%ecx), %xmm2
+ movaps 25(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $7, %xmm3, %xmm4
+ jnz L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9):
+ movaps -9(%ecx), %xmm1
+ movaps 7(%ecx), %xmm2
+L(Shl9Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 23(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -7(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+ movaps 7(%ecx), %xmm2
+ movaps 23(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $9, %xmm3, %xmm4
+ jnz L(Shl9Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave9)
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $7, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10):
+ movaps -10(%ecx), %xmm1
+ movaps 6(%ecx), %xmm2
+L(Shl10Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 22(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -6(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+ movaps 6(%ecx), %xmm2
+ movaps 22(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $10, %xmm3, %xmm4
+ jnz L(Shl10Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave10)
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $6, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11):
+ movaps -11(%ecx), %xmm1
+ movaps 5(%ecx), %xmm2
+L(Shl11Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 21(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -5(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+ movaps 5(%ecx), %xmm2
+ movaps 21(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $11, %xmm3, %xmm4
+ jnz L(Shl11Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave11)
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+ movlpd -3(%ecx), %xmm0
+ movlpd %xmm0, -3(%edx)
+ mov $5, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave12)
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13):
+ movaps -13(%ecx), %xmm1
+ movaps 3(%ecx), %xmm2
+L(Shl13Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 19(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -3(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+ movaps 3(%ecx), %xmm2
+ movaps 19(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $13, %xmm3, %xmm4
+ jnz L(Shl13Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave13)
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14):
+ movaps -14(%ecx), %xmm1
+ movaps 2(%ecx), %xmm2
+L(Shl14Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 18(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -2(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+ movaps 2(%ecx), %xmm2
+ movaps 18(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $14, %xmm3, %xmm4
+ jnz L(Shl14Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave14)
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15):
+ movaps -15(%ecx), %xmm1
+ movaps 1(%ecx), %xmm2
+L(Shl15Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 17(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -1(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+ movaps 1(%ecx), %xmm2
+ movaps 17(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $15, %xmm3, %xmm4
+ jnz L(Shl15Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave15)
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
+# ifdef USE_AS_STRCAT
+ jmp L(CopyFrom1To16Bytes)
+# endif
+
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+ add $16, %ebx
+# endif
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+
+ .p2align 4
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+
+ .p2align 4
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ add %esi, %edx
+
+ POP (%esi)
+
+ test %al, %al
+ jz L(ExitHighCase2)
+
+ cmp $8, %ebx
+ ja L(CopyFrom1To16BytesLess8)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $8, %ebx
+ jbe L(CopyFrom1To16BytesLess8Case3)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(Exit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(Exit15)
+ jmp L(Exit16)
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+ cmp $4, %ebx
+ ja L(ExitHigh4Case3)
+
+ cmp $1, %ebx
+ je L(Exit1)
+ cmp $2, %ebx
+ je L(Exit2)
+ cmp $3, %ebx
+ je L(Exit3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (4)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4Case3):
+ cmp $5, %ebx
+ je L(Exit5)
+ cmp $6, %ebx
+ je L(Exit6)
+ cmp $7, %ebx
+ je L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (8)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8Case3):
+ cmp $12, %ebx
+ ja L(ExitHigh12Case3)
+
+ cmp $9, %ebx
+ je L(Exit9)
+ cmp $10, %ebx
+ je L(Exit10)
+ cmp $11, %ebx
+ je L(Exit11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (12)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12Case3):
+ cmp $13, %ebx
+ je L(Exit13)
+ cmp $14, %ebx
+ je L(Exit14)
+ cmp $15, %ebx
+ je L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ SAVE_RESULT (16)
+ RETURN1
+
+# endif
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+CFI_POP (%edi)
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%ecx)
+ movb %dl, 2(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%ecx)
+ movb %dl, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%ecx)
+ movw %dx, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%ecx)
+ movl %edx, 3(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%ecx)
+ movb %dl, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%ecx)
+ movw %dx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 5(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 6(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(StrncpyFillExit1):
+ lea 16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+ test %ebx, %ebx
+ jz L(Fill0)
+ cmp $16, %ebx
+ je L(Fill16)
+ cmp $8, %ebx
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %ebx
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %ebx
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %ebx
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ CFI_PUSH(%edi)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1):
+ POP (%edi)
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit1)
+
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+
+ lea 16(%ecx), %ecx
+
+ mov %ecx, %edx
+ and $0xf, %edx
+ sub %edx, %ecx
+ add %edx, %ebx
+ xor %edx, %edx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ movdqa %xmm0, 32(%ecx)
+ movdqa %xmm0, 48(%ecx)
+ lea 64(%ecx), %ecx
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ lea 32(%ecx), %ecx
+ sub $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+# endif
+
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT_TAIL (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT_TAIL (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT_TAIL (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT_TAIL (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT_TAIL (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT_TAIL (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT_TAIL (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT_TAIL (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT_TAIL (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT_TAIL (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT_TAIL (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT_TAIL (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT_TAIL (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT_TAIL (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ CFI_PUSH (%esi)
+ CFI_PUSH (%edi)
+# endif
+ .p2align 4
+L(StrncpyLeaveCase2OrCase3):
+ test %eax, %eax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
+ .p2align 4
+L(StrncpyExit1Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ mov $15, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit2Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit3Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit4Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit5Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit6Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit7Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit8Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit9Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $7, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit10Case2OrCase3):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $6, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit11Case2OrCase3):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $5, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit12Case2OrCase3):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit13Case2OrCase3):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit14Case2OrCase3):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit15Case2OrCase3):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit1):
+ lea 15(%edx, %esi), %edx
+ lea 15(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit2):
+ lea 14(%edx, %esi), %edx
+ lea 14(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit3):
+ lea 13(%edx, %esi), %edx
+ lea 13(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit4):
+ lea 12(%edx, %esi), %edx
+ lea 12(%ecx, %esi), %ecx
+ movlpd -12(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit5):
+ lea 11(%edx, %esi), %edx
+ lea 11(%ecx, %esi), %ecx
+ movlpd -11(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -11(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit6):
+ lea 10(%edx, %esi), %edx
+ lea 10(%ecx, %esi), %ecx
+
+ movlpd -10(%ecx), %xmm0
+ movw -2(%ecx), %ax
+ movlpd %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit7):
+ lea 9(%edx, %esi), %edx
+ lea 9(%ecx, %esi), %ecx
+
+ movlpd -9(%ecx), %xmm0
+ movb -1(%ecx), %ah
+ movlpd %xmm0, -9(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit8):
+ lea 8(%edx, %esi), %edx
+ lea 8(%ecx, %esi), %ecx
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit9):
+ lea 7(%edx, %esi), %edx
+ lea 7(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit10):
+ lea 6(%edx, %esi), %edx
+ lea 6(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit11):
+ lea 5(%edx, %esi), %edx
+ lea 5(%ecx, %esi), %ecx
+ movl -5(%ecx), %esi
+ movb -1(%ecx), %ah
+ movl %esi, -5(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit12):
+ lea 4(%edx, %esi), %edx
+ lea 4(%ecx, %esi), %ecx
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit13):
+ lea 3(%edx, %esi), %edx
+ lea 3(%ecx, %esi), %ecx
+
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit14):
+ lea 2(%edx, %esi), %edx
+ lea 2(%ecx, %esi), %ecx
+ movw -2(%ecx), %ax
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit15):
+ lea 1(%edx, %esi), %edx
+ lea 1(%ecx, %esi), %ecx
+ movb -1(%ecx), %ah
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+# endif
+
+# ifndef USE_AS_STRCAT
+# ifdef USE_AS_STRNCPY
+ CFI_POP (%esi)
+ CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail0):
+ movl %edx, %eax
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $12, %ebx
+ jbe L(StrncpyExit12Bytes)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmp $13, %ebx
+ je L(ExitTail13)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $14, %ebx
+ je L(ExitTail14)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12Bytes):
+ cmp $9, %ebx
+ je L(ExitTail9)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $10, %ebx
+ je L(ExitTail10)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $11, %ebx
+ je L(ExitTail11)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $4, %ebx
+ jbe L(StrncpyExit4Bytes)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+
+ cmp $5, %ebx
+ je L(ExitTail5)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $6, %ebx
+ je L(ExitTail6)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $7, %ebx
+ je L(ExitTail7)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4Bytes):
+ test %ebx, %ebx
+ jz L(ExitTail0)
+ cmp $1, %ebx
+ je L(ExitTail1)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $2, %ebx
+ je L(ExitTail2)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $3, %ebx
+ je L(ExitTail3)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+# endif
+
+END (STRCPY)
+# endif
+#endif
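
Throughout the Shl* blocks above, the 64-byte main loops detect a zero byte
anywhere in the block with three pminub instructions feeding a single
pcmpeqb/pmovmskb test.  A minimal C sketch of that trick, using SSE2
intrinsics and a made-up name (not glibc code):

    #include <emmintrin.h>

    /* Nonzero iff any of the 64 bytes at p (16-byte aligned) is zero:
       the lane-wise unsigned minimum of the four vectors has a zero
       lane exactly where one of them had a zero byte.  */
    static int has_zero_byte_64 (const unsigned char *p)
    {
      __m128i a = _mm_load_si128 ((const __m128i *) p);
      __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
      __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
      __m128i z = _mm_cmpeq_epi8 (m, _mm_setzero_si128 ()); /* pcmpeqb */
      return _mm_movemask_epi8 (z);                         /* pmovmskb */
    }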
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000000..ffbc03c6d5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,116 @@
+/* Multiple versions of strcpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_IA32 __stpncpy_ia32
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
+# else
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_IA32 __stpcpy_ia32
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_IA32 __strncpy_ia32
+# define __GI_STRCPY __GI_strncpy
+# else
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_IA32 __strcpy_ia32
+# define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strncpy in the static library, since
+   strncpy is needed before the IFUNC initialization has happened.  */
+#if IS_IN (libc)
+
+ .text
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
+2: ret
+END(STRCPY)
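
The ENTRY(STRCPY) block above is a hand-written ELF IFUNC resolver.  For
comparison, a minimal C sketch of the same dispatch, assuming GCC's ifunc
attribute and __builtin_cpu_supports; the names, bodies, and the single
feature test are illustrative, not glibc's:

    typedef char *strcpy_fn (char *, const char *);

    static char *my_strcpy_ia32 (char *d, const char *s)
    { char *r = d; while ((*d++ = *s++) != '\0') ; return r; }

    static char *my_strcpy_ssse3 (char *d, const char *s)
    { return my_strcpy_ia32 (d, s); /* stand-in for the SSSE3 body */ }

    /* Runs once, at relocation time, and picks the implementation.  */
    static strcpy_fn *my_strcpy_resolver (void)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("ssse3"))
        return my_strcpy_ssse3;
      return my_strcpy_ia32;
    }

    char *my_strcpy (char *, const char *)
      __attribute__ ((ifunc ("my_strcpy_resolver")));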
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCPY_IA32, @function; \
+ .align 16; \
+ .globl STRCPY_IA32; \
+ .hidden STRCPY_IA32; \
+ STRCPY_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using the SSSE3 instructions is likely eaten
+   away by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
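
The libc_hidden_builtin_def redefinition above is how the comment's point is
carried out: libc-internal callers are bound to a hidden alias of the IA32
version, so they go through neither the PLT nor the IFUNC resolver.  A rough
C equivalent, with hypothetical names:

    #include <string.h>

    char *my_strcpy_ia32 (char *d, const char *s)
    { return strcpy (d, s); }   /* stand-in for the real body */

    /* Hidden alias: internal calls to __GI_my_strcpy bind directly at
       static link time, bypassing PLT and IFUNC indirection.  */
    extern __typeof (my_strcpy_ia32) __GI_my_strcpy
      __attribute__ ((alias ("my_strcpy_ia32"), visibility ("hidden")));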
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# include "../../stpncpy.S"
+# else
+# include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+# include "../../i586/strcpy.S"
+# endif
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
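
The included x86_64 strcspn-c.c builds __strcspn_sse42 around the PCMPISTRI
string instruction.  A minimal sketch of that one primitive, assuming SSE4.2
intrinsics; this shows the core comparison only, not the actual glibc loop:

    #include <nmmintrin.h>

    /* Index (0..15) of the first byte of frag that occurs anywhere in
       set, or 16 if none does -- one pcmpistri scans all 16 bytes.  */
    static int first_match_index (__m128i set, __m128i frag)
    {
      return _mm_cmpistri (set, frag,
                           _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                           | _SIDD_LEAST_SIGNIFICANT);
    }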
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..21e5093924
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,75 @@
+/* Multiple versions of strcspn
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_IA32 __strpbrk_ia32
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_IA32 __strcspn_ia32
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library, since
+   strpbrk is needed before the IFUNC initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
+2: ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_IA32, @function; \
+ .globl STRCSPN_IA32; \
+ .p2align 4; \
+ STRCSPN_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they are called without setting up EBX, which is needed for
+   the PLT that IFUNC relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
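
Where IFUNC cannot be used -- the situation the comment above describes for
hidden functions -- the usual fallback is an ordinary function pointer chosen
once at startup.  A sketch under that assumption, with hypothetical names and
stand-in bodies:

    #include <stddef.h>
    #include <string.h>

    static size_t my_strcspn_ia32 (const char *s, const char *rej)
    { return strcspn (s, rej); }        /* stand-in baseline */

    static size_t my_strcspn_sse42 (const char *s, const char *rej)
    { return strcspn (s, rej); }        /* stand-in for the SSE4.2 body */

    static size_t (*my_strcspn) (const char *, const char *)
      = my_strcspn_ia32;

    static void __attribute__ ((constructor)) my_strcspn_init (void)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("sse4.2"))
        my_strcspn = my_strcspn_sse42;
    }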
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
new file mode 100644
index 0000000000..d3ea864bab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -0,0 +1,125 @@
+/* strlen with SSE2 and BSF
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined SHARED && IS_IN (libc)
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+#define PARMS 4 + 8 /* Preserve ESI and EDI. */
+#define STR PARMS
+#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state
+#define RETURN POP (%edi); POP (%esi); ret; \
+ cfi_restore_state; cfi_remember_state
+
+ .text
+ENTRY (__strlen_sse2_bsf)
+ ENTRANCE
+ mov STR(%esp), %edi
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%edi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %edi, %eax
+ and $-16, %eax
+ jmp L(align16_start)
+L(next):
+ mov %edi, %eax
+ and $-16, %eax
+ pcmpeqb (%eax), %xmm0
+ mov $-1, %esi
+ sub %eax, %ecx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ .p2align 4
+L(align16_loop):
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %edi, %eax
+L(exit_less16):
+ bsf %edx, %edx
+ add %edx, %eax
+ RETURN
+L(exit16):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $16, %eax
+ RETURN
+L(exit32):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $32, %eax
+ RETURN
+L(exit48):
+ sub %edi, %eax
+ bsf %edx, %edx
+ add %edx, %eax
+ add $48, %eax
+ POP (%edi)
+ POP (%esi)
+ ret
+
+END (__strlen_sse2_bsf)
+
+#endif
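The building blocks above map directly onto SSE2 intrinsics: PCMPEQB against an all-zero register marks the null bytes, PMOVMSKB collapses the 16 byte results into a 16-bit mask, and BSF (here __builtin_ctz) finds the first set bit. A simplified sketch with a hypothetical strlen_sse2_sketch (not the glibc code); it relies on the fact that an aligned 16-byte load never crosses a page boundary, which is why reading a few bytes before S is harmless:

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    static size_t
    strlen_sse2_sketch (const char *s)
    {
      /* Align down to 16 bytes; bits for bytes before S are masked off.  */
      const __m128i *v = (const __m128i *) ((uintptr_t) s & ~(uintptr_t) 15);
      unsigned int skip = (uintptr_t) s & 15;
      const __m128i zero = _mm_setzero_si128 ();

      unsigned int mask =
        _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (v), zero));
      mask = (mask >> skip) << skip;    /* ignore bytes before S */
      while (mask == 0)
        {
          ++v;
          mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (v), zero));
        }
      return (size_t) ((const char *) v + __builtin_ctz (mask) - s);
    }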
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
new file mode 100644
index 0000000000..36fc1469d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -0,0 +1,695 @@
+/* strlen with SSE2
+ Copyright (C) 2010-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* For strlen, only the SHARED version is optimized; for strcat, strncat
+   and strnlen, both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+
+# include <sysdep.h>
+# define PARMS 4
+# define STR PARMS
+# define RETURN ret
+
+# ifdef USE_AS_STRNLEN
+# define LEN PARMS + 8
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+# undef RETURN
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifndef STRLEN
+# define STRLEN __strlen_sse2
+# endif
+
+ atom_text_section
+ENTRY (STRLEN)
+ mov STR(%esp), %edx
+# ifdef USE_AS_STRNLEN
+ PUSH (%edi)
+ movl LEN(%esp), %edi
+ sub $4, %edi
+ jbe L(len_less4_prolog)
+# endif
+# endif
+ xor %eax, %eax
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less8_prolog)
+# endif
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less12_prolog)
+# endif
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less16_prolog)
+# endif
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+
+ pxor %xmm0, %xmm0
+ lea 16(%edx), %eax
+ mov %eax, %ecx
+ and $-16, %eax
+
+# ifdef USE_AS_STRNLEN
+ and $15, %edx
+ add %edx, %edi
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ mov %eax, %edx
+ and $63, %edx
+ add %edx, %edi
+# endif
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqb %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%eax), %eax
+ jz L(aligned_64_loop)
+
+ pcmpeqb -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+L(exit):
+ sub %ecx, %eax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(exit_tail1)
+ test $0x04, %dl
+ jnz L(exit_tail2)
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_8):
+ test $0x10, %dl
+ jnz L(exit_tail4)
+ test $0x20, %dl
+ jnz L(exit_tail5)
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_high_8)
+ test $0x01, %dh
+ jnz L(exit_tail8)
+ test $0x02, %dh
+ jnz L(exit_tail9)
+ test $0x04, %dh
+ jnz L(exit_tail10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high_8):
+ test $0x10, %dh
+ jnz L(exit_tail12)
+ test $0x20, %dh
+ jnz L(exit_tail13)
+ test $0x40, %dh
+ jnz L(exit_tail14)
+ add $15, %eax
+L(exit_tail0):
+ RETURN
+
+# ifdef USE_AS_STRNLEN
+
+ .p2align 4
+L(len_less64):
+ pxor %xmm0, %xmm0
+ add $64, %edi
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ movl LEN(%esp), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit):
+ sub %ecx, %eax
+
+ test %dl, %dl
+ jz L(strnlen_exit_high)
+ mov %dl, %cl
+ and $15, %cl
+ jz L(strnlen_exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(strnlen_exit_tail1)
+ test $0x04, %dl
+ jnz L(strnlen_exit_tail2)
+ sub $4, %edi
+ jb L(return_start_len)
+ lea 3(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_8):
+ test $0x10, %dl
+ jnz L(strnlen_exit_tail4)
+ test $0x20, %dl
+ jnz L(strnlen_exit_tail5)
+ test $0x40, %dl
+ jnz L(strnlen_exit_tail6)
+ sub $8, %edi
+ jb L(return_start_len)
+ lea 7(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(strnlen_exit_high_8)
+ test $0x01, %dh
+ jnz L(strnlen_exit_tail8)
+ test $0x02, %dh
+ jnz L(strnlen_exit_tail9)
+ test $0x04, %dh
+ jnz L(strnlen_exit_tail10)
+ sub $12, %edi
+ jb L(return_start_len)
+ lea 11(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high_8):
+ test $0x10, %dh
+ jnz L(strnlen_exit_tail12)
+ test $0x20, %dh
+ jnz L(strnlen_exit_tail13)
+ test $0x40, %dh
+ jnz L(strnlen_exit_tail14)
+ sub $16, %edi
+ jb L(return_start_len)
+ lea 15(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail1):
+ sub $2, %edi
+ jb L(return_start_len)
+ lea 1(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail2):
+ sub $3, %edi
+ jb L(return_start_len)
+ lea 2(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail4):
+ sub $5, %edi
+ jb L(return_start_len)
+ lea 4(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail5):
+ sub $6, %edi
+ jb L(return_start_len)
+ lea 5(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail6):
+ sub $7, %edi
+ jb L(return_start_len)
+ lea 6(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail8):
+ sub $9, %edi
+ jb L(return_start_len)
+ lea 8(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail9):
+ sub $10, %edi
+ jb L(return_start_len)
+ lea 9(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail10):
+ sub $11, %edi
+ jb L(return_start_len)
+ lea 10(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail12):
+ sub $13, %edi
+ jb L(return_start_len)
+ lea 12(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail13):
+ sub $14, %edi
+ jb L(return_start_len)
+ lea 13(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail14):
+ sub $15, %edi
+ jb L(return_start_len)
+ lea 14(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(return_start_len):
+ movl LEN(%esp), %eax
+ RETURN
+
+/* For the prolog only.  */
+
+ .p2align 4
+L(len_less4_prolog):
+ xor %eax, %eax
+
+ add $4, %edi
+ jz L(exit_tail0)
+
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmp $1, %edi
+ je L(exit_tail1)
+
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmp $2, %edi
+ je L(exit_tail2)
+
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmp $3, %edi
+ je L(exit_tail3)
+
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+ mov $4, %eax
+ RETURN
+
+ .p2align 4
+L(len_less8_prolog):
+ add $4, %edi
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmp $1, %edi
+ je L(exit_tail5)
+
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmp $2, %edi
+ je L(exit_tail6)
+
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmp $3, %edi
+ je L(exit_tail7)
+
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+ mov $8, %eax
+ RETURN
+
+
+ .p2align 4
+L(len_less12_prolog):
+ add $4, %edi
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmp $1, %edi
+ je L(exit_tail9)
+
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmp $2, %edi
+ je L(exit_tail10)
+
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmp $3, %edi
+ je L(exit_tail11)
+
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+ mov $12, %eax
+ RETURN
+
+ .p2align 4
+L(len_less16_prolog):
+ add $4, %edi
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmp $1, %edi
+ je L(exit_tail13)
+
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmp $2, %edi
+ je L(exit_tail14)
+
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmp $3, %edi
+ je L(exit_tail15)
+
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+ mov $16, %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ RETURN
+
+L(exit_tail2):
+ add $2, %eax
+ RETURN
+
+L(exit_tail3):
+ add $3, %eax
+ RETURN
+
+L(exit_tail4):
+ add $4, %eax
+ RETURN
+
+L(exit_tail5):
+ add $5, %eax
+ RETURN
+
+L(exit_tail6):
+ add $6, %eax
+ RETURN
+
+L(exit_tail7):
+ add $7, %eax
+ RETURN
+
+L(exit_tail8):
+ add $8, %eax
+ RETURN
+
+L(exit_tail9):
+ add $9, %eax
+ RETURN
+
+L(exit_tail10):
+ add $10, %eax
+ RETURN
+
+L(exit_tail11):
+ add $11, %eax
+ RETURN
+
+L(exit_tail12):
+ add $12, %eax
+ RETURN
+
+L(exit_tail13):
+ add $13, %eax
+ RETURN
+
+L(exit_tail14):
+ add $14, %eax
+ RETURN
+
+L(exit_tail15):
+ add $15, %eax
+# ifndef USE_AS_STRCAT
+ RETURN
+END (STRLEN)
+# endif
+#endif
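The 64-byte main loop above (L(aligned_64_loop)) hinges on PMINUB: the unsigned byte-wise minimum of four vectors has a zero byte in a lane exactly when one of the inputs does, so a single PCMPEQB/PMOVMSKB pair tests the whole block. A sketch of that fold with a hypothetical block64_has_zero (input assumed 16-byte aligned):

    #include <emmintrin.h>

    /* Nonzero iff any of the 64 bytes at P is zero.  */
    static int
    block64_has_zero (const __m128i *p)
    {
      __m128i a = _mm_load_si128 (p);
      __m128i b = _mm_load_si128 (p + 1);
      __m128i c = _mm_load_si128 (p + 2);
      __m128i d = _mm_load_si128 (p + 3);
      __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ())) != 0;
    }

Once this reports a zero, the code above re-tests the four vectors individually to find which 16-byte block holds the terminator.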
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..77cf6bcdb0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,60 @@
+/* Multiple versions of strlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need strlen before the initialization
+   has happened.  */
+#if defined SHARED && IS_IN (libc)
+ .text
+ENTRY(strlen)
+ .type strlen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strlen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2: ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strlen_ia32, @function; \
+ .globl __strlen_ia32; \
+ .p2align 4; \
+ __strlen_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
new file mode 100644
index 0000000000..76581eb62b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c
@@ -0,0 +1,8 @@
+#include <string.h>
+
+extern __typeof (strncasecmp) __strncasecmp_nonascii;
+
+#define __strncasecmp __strncasecmp_nonascii
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
new file mode 100644
index 0000000000..a56e63a566
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S
@@ -0,0 +1,39 @@
+/* Entry point for multi-version x86 strncasecmp.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY(__strncasecmp)
+ .type __strncasecmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ HAS_ARCH_FEATURE (Slow_SSE4_2)
+ jnz 2f
+ LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
+2: ret
+END(__strncasecmp)
+
+weak_alias (__strncasecmp, strncasecmp)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
new file mode 100644
index 0000000000..7e601af271
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c
@@ -0,0 +1,13 @@
+#include <string.h>
+
+extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii;
+
+#define __strncasecmp_l __strncasecmp_l_nonascii
+#define USE_IN_EXTENDED_LOCALE_MODEL 1
+#include <string/strncase.c>
+
+strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32)
+
+/* The needs of strncasecmp_l in libc are minimal, no need to go
+   through the IFUNC.  */
+strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
new file mode 100644
index 0000000000..557210832e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
new file mode 100644
index 0000000000..d438a1ae35
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRNCASECMP_L 1
+#include "strcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
new file mode 100644
index 0000000000..8a74ee8574
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S
@@ -0,0 +1,7 @@
+/* Multiple versions of strncasecmp_l
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCMP __strncasecmp_l
+#define USE_AS_STRNCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strncasecmp_l, strncasecmp_l)
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
new file mode 100644
index 0000000000..132a000545
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_ia32
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32);
+#endif
+
+#include "string/strncat.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
new file mode 100644
index 0000000000..f1045b72b8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_sse2
+#define USE_AS_STRNCAT
+
+#include "strcat-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000000..625b90a978
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S
@@ -0,0 +1,4 @@
+#define STRCAT __strncat_ssse3
+#define USE_AS_STRNCAT
+
+#include "strcat-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
new file mode 100644
index 0000000000..5c1bf41453
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncat
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..cc059da494
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+# define STRNCMP __strncmp_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32);
+#endif
+
+#include "string/strncmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
new file mode 100644
index 0000000000..cf14dfaf6c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP __strncmp_sse4_2
+# include "strcmp-sse4.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..536c8685f2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S
@@ -0,0 +1,5 @@
+#ifdef SHARED
+# define USE_AS_STRNCMP
+# define STRCMP __strncmp_ssse3
+# include "strcmp-ssse3.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
new file mode 100644
index 0000000000..150d4786d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncmp
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
+#include "strcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..201e3f98b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000000..bdd99239a4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000000..bf82ee447d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000000..9c257efc6e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strncpy
+ All versions must be listed in ifunc-impl-list.c. */
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 0000000000..351e939a93
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,10 @@
+#define STRNLEN __strnlen_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \
+ strong_alias (__strnlen_ia32, __strnlen_ia32_1); \
+ __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1);
+#endif
+
+#include "string/strnlen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000000..56b6ae2a5c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 0000000000..d241522c70
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__strnlen)
+ .type __strnlen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2: ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..7201d6376f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,5 @@
+/* Multiple versions of strpbrk
+ All versions must be listed in ifunc-impl-list.c. */
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
new file mode 100644
index 0000000000..39a7c8825b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S
@@ -0,0 +1,282 @@
+/* strrchr with SSE2 with bsf and bsr
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ .text
+ENTRY (__strrchr_sse2_bsf)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ PUSH (%edi)
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ pshufd $0, %xmm1, %xmm1
+ ja L(crosscache)
+
+/* Unaligned string.  */
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ and $-16, %edi
+ add $16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_return_value1):
+ bsf %edx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_null)
+ bsr %eax, %eax
+ add %edi, %eax
+ POP (%edi)
+ ret
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %edx, %edx
+ jnz L(unaligned_return_value1)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ lea 16(%edi), %esi
+ and $-16, %edi
+ add $16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ add $16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_return_value):
+ add %ecx, %edi
+ bsf %edx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_null)
+ bsr %eax, %eax
+ add %edi, %eax
+ POP (%edi)
+ ret
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(unaligned_return_value)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ add $16, %edi
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %ebx, %ebx
+ jz L(return_null_1)
+ bsr %ebx, %eax
+ add %esi, %eax
+
+ POP (%ebx)
+ POP (%esi)
+
+ sub $16, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(return_value_1)
+ mov %eax, %ebx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(return_value_1):
+ bsf %ecx, %ecx
+ mov $2, %edx
+ shl %cl, %edx
+ sub $1, %edx
+ and %edx, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+
+ bsr %eax, %eax
+ add %edi, %eax
+ sub $16, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+/* Return NULL. */
+ .p2align 4
+L(return_null_1):
+ POP (%ebx)
+ POP (%esi)
+ POP (%edi)
+ xor %eax, %eax
+ ret
+
+END (__strrchr_sse2_bsf)
+#endif
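The exit paths above share one bit trick: BSF finds the position of the first null byte in EDX, the sequence `mov $2, %edx; shl %cl, %edx; sub $1, %edx` builds a mask of all bit positions at or before it, and BSR then picks the last remaining match. The same computation in C, as a sketch with a hypothetical last_match_before_null:

    /* MATCH_MASK/NULL_MASK are PMOVMSKB results; NULL_MASK must be
       nonzero.  Returns the byte index of the last match at or before
       the terminator, or -1 if there is none.  */
    static int
    last_match_before_null (unsigned int match_mask, unsigned int null_mask)
    {
      unsigned int pos = __builtin_ctz (null_mask);     /* BSF */
      match_mask &= (2u << pos) - 1;                    /* keep bits 0..pos */
      if (match_mask == 0)
        return -1;                                      /* strrchr returns NULL */
      return 31 - (int) __builtin_clz (match_mask);     /* BSR */
    }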
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000000..20934288be
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -0,0 +1,708 @@
+/* strrchr SSE2 without bsf and bsr
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH(%edi);
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__strrchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ pshufd $0, %xmm1, %xmm1
+ ja L(crosscache)
+
+/* Unaligned string.  */
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %ebx, %ebx
+ jz L(return_null_1)
+ mov %ebx, %eax
+ mov %esi, %edi
+
+ POP (%ebx)
+ POP (%esi)
+
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%ebx)
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+ mov %eax, %ebx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(find_zero_8)
+ test $0x01, %cl
+ jnz L(FindZeroExit1)
+ test $0x02, %cl
+ jnz L(FindZeroExit2)
+ test $0x04, %cl
+ jnz L(FindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_8):
+ test $0x10, %cl
+ jnz L(FindZeroExit5)
+ test $0x20, %cl
+ jnz L(FindZeroExit6)
+ test $0x40, %cl
+ jnz L(FindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(find_zero_high_8)
+ test $0x01, %ch
+ jnz L(FindZeroExit9)
+ test $0x02, %ch
+ jnz L(FindZeroExit10)
+ test $0x04, %ch
+ jnz L(FindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high_8):
+ test $0x10, %ch
+ jnz L(FindZeroExit13)
+ test $0x20, %ch
+ jnz L(FindZeroExit14)
+ test $0x40, %ch
+ jnz L(FindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit1):
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_exit)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+
+ .p2align 4
+L(match_exit):
+ test %ah, %ah
+ jnz L(match_exit_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(match_exit_8)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x02, %al
+ jnz L(Exit2)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_8):
+ test $0x80, %al
+ jnz L(Exit8)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x20, %al
+ jnz L(Exit6)
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(match_exit_high_8)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x02, %ah
+ jnz L(Exit10)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_exit_high_8):
+ test $0x80, %ah
+ jnz L(Exit16)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x20, %ah
+ jnz L(Exit14)
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea -15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea -14(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea -13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea -11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea -10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ lea -9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea -7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea -6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea -5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea -3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea -2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ lea -1(%edi), %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(prolog_find_zero_8)
+ test $0x01, %cl
+ jnz L(PrologFindZeroExit1)
+ test $0x02, %cl
+ jnz L(PrologFindZeroExit2)
+ test $0x04, %cl
+ jnz L(PrologFindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_8):
+ test $0x10, %cl
+ jnz L(PrologFindZeroExit5)
+ test $0x20, %cl
+ jnz L(PrologFindZeroExit6)
+ test $0x40, %cl
+ jnz L(PrologFindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(prolog_find_zero_high_8)
+ test $0x01, %ch
+ jnz L(PrologFindZeroExit9)
+ test $0x02, %ch
+ jnz L(PrologFindZeroExit10)
+ test $0x04, %ch
+ jnz L(PrologFindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high_8):
+ test $0x10, %ch
+ jnz L(PrologFindZeroExit13)
+ test $0x20, %ch
+ jnz L(PrologFindZeroExit14)
+ test $0x40, %ch
+ jnz L(PrologFindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit1):
+ and $1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jnz L(match_exit)
+ xor %eax, %eax
+ RETURN
+
+END (__strrchr_sse2)
+#endif
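Unlike the BSF variant, this version never resolves a match position inside the loop: it only remembers the mask and base address of the most recent vector that contained a match (EBX/ESI in the assembly) and decodes the exact byte with the bit-test cascades above once the terminator turns up. A condensed sketch of that strategy with a hypothetical strrchr_sse2_sketch (P must be 16-byte aligned, which glibc's prolog establishes):

    #include <emmintrin.h>

    static const char *
    strrchr_sse2_sketch (const char *p, int c)
    {
      const __m128i needle = _mm_set1_epi8 ((char) c);
      const __m128i zero = _mm_setzero_si128 ();
      unsigned int last_mask = 0;
      const char *last_base = 0;

      for (;; p += 16)
        {
          __m128i v = _mm_load_si128 ((const __m128i *) p);
          unsigned int zmask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
          unsigned int mmask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, needle));
          if (zmask != 0)
            {
              /* Keep only matches at or before the terminator.  */
              mmask &= ((zmask & (0u - zmask)) << 1) - 1;
              if (mmask != 0)
                return p + 31 - __builtin_clz (mmask);
              if (last_mask != 0)
                return last_base + 31 - __builtin_clz (last_mask);
              return 0;
            }
          if (mmask != 0)
            {
              last_mask = mmask;        /* resolve lazily, as L(match) does */
              last_base = p;
            }
        }
    }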
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
new file mode 100644
index 0000000000..d9281eaeae
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(strrchr)
+ .type strrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+ HAS_ARCH_FEATURE (Slow_BSF)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2: ret
+END(strrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strrchr_ia32, @function; \
+ .globl __strrchr_ia32; \
+ .p2align 4; \
+ __strrchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_ia32
+#endif
+
+#include "../../strrchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..1269062381
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strspn
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__strspn_ia32)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__strspn_sse42)
+2: ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_ia32, @function; \
+ .globl __strspn_ia32; \
+ .p2align 4; \
+__strspn_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in a shared library,
+   since they will be called without setting up EBX, which is needed
+   for the PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#include "../../strspn.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
new file mode 100644
index 0000000000..593cfec273
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/test-multiarch.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
new file mode 100644
index 0000000000..7760b966e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000000..38d41d04de
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,22 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# undef libc_hidden_weak
+# define libc_hidden_weak(name)
+
+# undef weak_alias
+# define weak_alias(name,alias)
+
+# ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \
+ strong_alias (__wcschr_ia32, __wcschr_ia32_1); \
+ __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1);
+# endif
+#endif
+
+extern __typeof (wcschr) __wcschr_ia32;
+
+#define WCSCHR __wcschr_ia32
+#include <wcsmbs/wcschr.c>
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000000..9ff6c3b8d6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,219 @@
+/* wcschr with SSE2, without using bsf instructions
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcschr_sse2)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %eax
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+ and $63, %eax
+ cmp $48, %eax
+ ja L(cross_cache)
+
+ movdqu (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ and $-16, %ecx
+ jmp L(loop)
+
+ .p2align 4
+L(cross_cache):
+ PUSH (%edi)
+ mov %ecx, %edi
+ mov %eax, %ecx
+ and $-16, %edi
+ and $15, %ecx
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+
+ add %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jz L(match_case1)
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_no_match):
+ mov %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ pxor %xmm2, %xmm2
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ add $16, %ecx
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_4):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+ test $15, %ah
+ jnz L(match_case2_12)
+ test $15, %dh
+ jnz L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_12):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+ jz L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(exit0)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(exit3)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit0):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit3):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+END (__wcschr_sse2)
+#endif
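Because wchar_t is four bytes here, PCMPEQD produces four identical mask bits per character, which is why the code above tests nibbles ($15, %al and $15, %ah) rather than single bits. A sketch of decoding one vector with a hypothetical first_wchar_match:

    #include <emmintrin.h>

    /* P must be 16-byte aligned.  Returns the index (0..3) of the first
       4-byte wchar in the vector equal to WC, or -1 if none matches.  */
    static int
    first_wchar_match (const unsigned int *p, unsigned int wc)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      __m128i needle = _mm_set1_epi32 ((int) wc);
      unsigned int m = _mm_movemask_epi8 (_mm_cmpeq_epi32 (v, needle));
      if (m == 0)
        return -1;
      return (int) (__builtin_ctz (m) / 4);    /* 4 mask bits per wchar */
    }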
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..d3c65a6436
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcschr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__wcschr)
+ .type __wcschr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2: ret
+END(__wcschr)
+weak_alias (__wcschr, wcschr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
new file mode 100644
index 0000000000..e3337d77e2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c
@@ -0,0 +1,14 @@
+#include <wchar.h>
+
+#define WCSCMP __wcscmp_ia32
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32);
+#endif
+#undef weak_alias
+#define weak_alias(name, alias)
+
+extern __typeof (wcscmp) __wcscmp_ia32;
+
+#include "wcsmbs/wcscmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..a464b58204
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -0,0 +1,1018 @@
+/* wcscmp with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */
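+/* E.g. with 32-bit signed wchar_t the "jg" paths below order the
+   dword 0x80000000 before 0x00000001, while an unsigned comparison
+   would order them the other way.  */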
+
+ .text
+ENTRY (__wcscmp_sse2)
+/* This implementation uses SSE2 to compare up to 16 bytes at a time.  */
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %eax
+
+ mov (%eax), %ecx
+ cmp %ecx, (%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 4(%eax), %ecx
+ cmp %ecx, 4(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 8(%eax), %ecx
+ cmp %ecx, 8(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 12(%eax), %ecx
+ cmp %ecx, 12(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ ENTRANCE
+ add $16, %eax
+ add $16, %edx
+
+ mov %eax, %esi
+ mov %edx, %edi
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ mov %al, %ch
+ mov %dl, %cl
+ and $63, %eax /* esi alignment in cache line */
+ and $63, %edx /* edi alignment in cache line */
+ and $15, %cl
+ jz L(continue_00)
+ cmp $16, %edx
+ jb L(continue_0)
+ cmp $32, %edx
+ jb L(continue_16)
+ cmp $48, %edx
+ jb L(continue_32)
+
+L(continue_48):
+ and $15, %ch
+ jz L(continue_48_00)
+ cmp $16, %eax
+ jb L(continue_0_48)
+ cmp $32, %eax
+ jb L(continue_16_48)
+ cmp $48, %eax
+ jb L(continue_32_48)
+
+ .p2align 4
+L(continue_48_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
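+
+/* The psubb trick: a result byte is 0xff only where the dwords are
+   equal and non-null, so %edx == 0xffff exactly when all four wchar_t
+   pairs match and none is a terminator.  In that case the null mask in
+   %xmm0 was all-zero, leaving %xmm0 zeroed for the next block.  */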
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_48)
+
+L(continue_0):
+ and $15, %ch
+ jz L(continue_0_00)
+ cmp $16, %eax
+ jb L(continue_0_0)
+ cmp $32, %eax
+ jb L(continue_0_16)
+ cmp $48, %eax
+ jb L(continue_0_32)
+
+ .p2align 4
+L(continue_0_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_0_48)
+
+ .p2align 4
+L(continue_00):
+ and $15, %ch
+ jz L(continue_00_00)
+ cmp $16, %eax
+ jb L(continue_00_0)
+ cmp $32, %eax
+ jb L(continue_00_16)
+ cmp $48, %eax
+ jb L(continue_00_32)
+
+ .p2align 4
+L(continue_00_48):
+ pcmpeqd (%edi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_32):
+ and $15, %ch
+ jz L(continue_32_00)
+ cmp $16, %eax
+ jb L(continue_0_32)
+ cmp $32, %eax
+ jb L(continue_16_32)
+ cmp $48, %eax
+ jb L(continue_32_32)
+
+ .p2align 4
+L(continue_32_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_16):
+ and $15, %ch
+ jz L(continue_16_00)
+ cmp $16, %eax
+ jb L(continue_0_16)
+ cmp $32, %eax
+ jb L(continue_16_16)
+ cmp $48, %eax
+ jb L(continue_16_32)
+
+ .p2align 4
+L(continue_16_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_00_00):
+ movdqa (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqa 16(%edi), %xmm3
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqa 32(%edi), %xmm5
+ pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm5 /* packed sub of comparison results*/
+ pmovmskb %xmm5, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqa 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_00)
+
+ .p2align 4
+L(continue_00_32):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_16):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_0):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_48_00):
+ pcmpeqd (%esi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_16_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_0_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_16_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_0):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_0_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_16_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(less4_double_words1):
+ cmp (%esi), %eax
+ jne L(nequal)
+ test %eax, %eax
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(less4_double_words):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_16):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_16)
+ and $15, %dl
+ jz L(second_double_word_16)
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_16):
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_16):
+ and $15, %dh
+ jz L(fourth_double_word_16)
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_16):
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_32):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_32)
+ and $15, %dl
+ jz L(second_double_word_32)
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_32):
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_32):
+ and $15, %dh
+ jz L(fourth_double_word_32)
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_32):
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_48):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_48)
+ and $15, %dl
+ jz L(second_double_word_48)
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_48):
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_48):
+ and $15, %dh
+ jz L(fourth_double_word_48)
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_48):
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(return)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(return):
+ RETURN
+
+ .p2align 4
+L(equal):
+ xorl %eax, %eax
+ RETURN
+
+ CFI_POP (%edi)
+ CFI_POP (%esi)
+
+ .p2align 4
+L(neq):
+ mov $1, %eax
+ jg L(neq_bigger)
+ neg %eax
+
+L(neq_bigger):
+ ret
+
+ .p2align 4
+L(eq):
+ xorl %eax, %eax
+ ret
+
+END (__wcscmp_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
new file mode 100644
index 0000000000..7118bdd4db
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -0,0 +1,39 @@
+/* Multiple versions of wcscmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for the
+   DSO.  In static binaries, we need wcscmp before the initialization
+   has happened.  */
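+/* The IFUNC resolver below returns the address of __wcscmp_sse2 when
+   SSE2 is available and that of __wcscmp_ia32 otherwise; the dynamic
+   linker binds wcscmp to whichever implementation is returned.  */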
+#if IS_IN (libc)
+ .text
+ENTRY(__wcscmp)
+ .type __wcscmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2: ret
+END(__wcscmp)
+weak_alias (__wcscmp, wcscmp)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..fb3000392b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcscpy __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..6280ba92ab
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,600 @@
+/* wcscpy with SSSE3
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__wcscpy_ssse3)
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
+ cmp $0, (%ecx)
+ jz L(ExitTail4)
+ cmp $0, 4(%ecx)
+ jz L(ExitTail8)
+ cmp $0, 8(%ecx)
+ jz L(ExitTail12)
+ cmp $0, 12(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+ PUSH (%esi)
+ lea 16(%ecx), %esi
+
+ and $-16, %esi
+
+ pxor %xmm0, %xmm0
+ pcmpeqd (%esi), %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+ jz L(Align16Both)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $8, %eax
+ je L(Shl8)
+ jmp L(Shl12)
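+
+/* Both pointers are 4-byte aligned (wchar_t), so once the destination
+   is 16-byte aligned the source's residual misalignment can only be
+   0, 4, 8 or 12 bytes; each non-zero case has its own palignr-based
+   copy loop.  */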
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
+ mov $-0x40, %esi
+
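+/* Each iteration loads 64 bytes; pminub folds the four vectors so a
+   single pcmpeqd/pmovmskb can test all of them for a terminator, and
+   the data is stored only after the block is known not to end the
+   string.  */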
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqd %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqd %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ mov $-0x40, %esi
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ POP (%esi)
+ add $12, %edx
+ add $12, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ POP (%esi)
+ add $8, %edx
+ add $8, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit12)
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..cfc97dd87c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,36 @@
+/* Multiple versions of wcscpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(wcscpy)
+ .type wcscpy, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2: ret
+END(wcscpy)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 0000000000..a335dc0f7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WCSLEN __wcslen_ia32
+#endif
+
+extern __typeof (wcslen) __wcslen_ia32;
+
+#include "wcsmbs/wcslen.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000000..bd3fc4c79b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,193 @@
+/* wcslen with SSE2
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define STR 4
+
+ .text
+ENTRY (__wcslen_sse2)
+ mov STR(%esp), %edx
+
+ cmp $0, (%edx)
+ jz L(exit_tail0)
+ cmp $0, 4(%edx)
+ jz L(exit_tail1)
+ cmp $0, 8(%edx)
+ jz L(exit_tail2)
+ cmp $0, 12(%edx)
+ jz L(exit_tail3)
+ cmp $0, 16(%edx)
+ jz L(exit_tail4)
+ cmp $0, 20(%edx)
+ jz L(exit_tail5)
+ cmp $0, 24(%edx)
+ jz L(exit_tail6)
+ cmp $0, 28(%edx)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%edx), %eax
+ lea 16(%edx), %ecx
+ and $-16, %eax
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%eax), %eax
+ jz L(aligned_64_loop)
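+
+/* pminub merges the four vectors byte-wise, so a zero dword in any of
+   them yields a zero dword in %xmm2 (bytes from different vectors may
+   also combine into a false positive, which the rechecks below filter
+   out before resuming the loop).  */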
+
+ pcmpeqd -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ jmp L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
+ sub %ecx, %eax
+ shr $2, %eax
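+/* %eax is now the wchar_t index of the start of the 16-byte block that
+   holds the terminator; the pcmpeqd mask in %edx (four bits per
+   wchar_t) selects the 0-3 adjustment applied below.  */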
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_1)
+ ret
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_3)
+ add $2, %eax
+ ret
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ add $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail0):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_tail1):
+ mov $1, %eax
+ ret
+
+ .p2align 4
+L(exit_tail2):
+ mov $2, %eax
+ ret
+
+ .p2align 4
+L(exit_tail3):
+ mov $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail4):
+ mov $4, %eax
+ ret
+
+ .p2align 4
+L(exit_tail5):
+ mov $5, %eax
+ ret
+
+ .p2align 4
+L(exit_tail6):
+ mov $6, %eax
+ ret
+
+ .p2align 4
+L(exit_tail7):
+ mov $7, %eax
+ ret
+
+END (__wcslen_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 0000000000..6ef9b6e7b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,37 @@
+/* Multiple versions of wcslen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(__wcslen)
+ .type __wcslen, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2: ret
+END(__wcslen)
+
+weak_alias (__wcslen, wcslen)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..8d8a335b5b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#if IS_IN (libc)
+# define wcsrchr __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..1a9b60e55e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,354 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH (%edi);
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %edi
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ ja L(crosscache)
+
+/* unaligned string. */
+ movdqu (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+
+/* Save current match */
+ mov %eax, %edx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
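+
+/* From here on, %edx/%esi track the match mask and block address of
+   the most recent occurrence seen; the loop keeps overwriting them
+   until the terminator is found, so the last match wins.  */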
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm3
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+
+ mov %eax, %edx
+ lea (%edi, %ecx), %esi
+
+/* Loop starts on an aligned string.  */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm3
+ pcmpeqd %xmm3, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm4
+ pcmpeqd %xmm4, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm4
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm4, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm5
+ pcmpeqd %xmm5, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm5
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm5, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %edx, %edx
+ jz L(return_null_1)
+ mov %edx, %eax
+ mov %esi, %edi
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+/* save match info */
+ mov %eax, %edx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_fourth_wchar):
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match_second_wchar):
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_or_fourth_wchar):
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_wchar):
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_fourth_wchar):
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(prolog_find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_null)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_null)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(prolog_find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_null)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000000..cf67333995
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,35 @@
+/* Multiple versions of wcsrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+ .text
+ENTRY(wcsrchr)
+ .type wcsrchr, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+ HAS_CPU_FEATURE (SSE2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2: ret
+END(wcsrchr)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000000..75ab4b94c1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,9 @@
+#include <wchar.h>
+
+#if IS_IN (libc)
+# define WMEMCMP __wmemcmp_ia32
+#endif
+
+extern __typeof (wmemcmp) __wmemcmp_ia32;
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000000..1a857c7e21
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000000..a41ef95fc1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 0000000000..1b9a54a413
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,40 @@
+/* Multiple versions of wmemcmp
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+ HAS_CPU_FEATURE (SSSE3)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+ HAS_CPU_FEATURE (SSE4_2)
+ jz 2f
+ LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2: ret
+END(wmemcmp)
+#endif
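
The ENTRY above is a hand-written IFUNC resolver: it runs once, at symbol
resolution time, and returns the address of the best wmemcmp for the running
CPU, preferring SSE4.2 over SSSE3 over the generic code.  A hedged C sketch of
the same dispatch using GCC's ifunc attribute (__builtin_cpu_supports and the
my_* names are illustrative; glibc's real resolver reads its own cpu-features
data instead):

#include <wchar.h>

extern int __wmemcmp_ia32 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_2 (const wchar_t *, const wchar_t *, size_t);

/* Sketch: the resolver picks an implementation once; later calls go
   straight to it through the PLT. */
static __typeof (__wmemcmp_ia32) *
my_wmemcmp_resolver (void)
{
  if (__builtin_cpu_supports ("sse4.2"))
    return __wmemcmp_sse4_2;
  if (__builtin_cpu_supports ("ssse3"))
    return __wmemcmp_ssse3;
  return __wmemcmp_ia32;
}

int my_wmemcmp (const wchar_t *, const wchar_t *, size_t)
  __attribute__ ((ifunc ("my_wmemcmp_resolver")));
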
diff --git a/REORG.TODO/sysdeps/i386/i686/nptl/tls.h b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
new file mode 100644
index 0000000000..5b527af9d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h
@@ -0,0 +1,35 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _TLS_H
+
+/* Additional definitions for <tls.h> on i686 and up. */
+
+
+/* Macros to load from and store into segment registers. We can use
+ the 32-bit instructions. */
+#define TLS_GET_GS() \
+ ({ int __seg; __asm ("movl %%gs, %0" : "=q" (__seg)); __seg; })
+#define TLS_SET_GS(val) \
+ __asm ("movl %0, %%gs" :: "q" (val))
+
+
+/* Get the full set of definitions. */
+#include_next <tls.h>
+
+#endif /* tls.h */
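
TLS_GET_GS and TLS_SET_GS wrap single movl instructions on the %gs selector,
which i686 and later can apply to 32-bit registers directly.  A small hedged
usage sketch (the function is hypothetical):

/* Sketch: swap the thread-register selector, using the macros above. */
static void
swap_tls_segment (int new_seg, int *old_seg)
{
  *old_seg = TLS_GET_GS ();   /* read the current %gs selector */
  TLS_SET_GS (new_seg);       /* install the new one */
}
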
diff --git a/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
new file mode 100644
index 0000000000..ce9c94d41a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S
@@ -0,0 +1,20 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define HAVE_CMOV 1
+#include <sysdeps/i386/pthread_spin_trylock.S>
diff --git a/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
new file mode 100644
index 0000000000..9b5a1b0d47
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h
@@ -0,0 +1,23 @@
+/* Define macros for stack address aliasing issues for NPTL. i686 version.
+ Copyright (C) 2014-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* It is useful to avoid the 64k aliasing problem, which reliably
+   occurs when all stacks use sizes that are a multiple of 64k.  Tell
+   the stack allocator to break this pattern by allocating one more
+   page if necessary. */
+#define MULTI_PAGE_ALIASING 65536
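
The constant above asks NPTL's stack allocator to perturb stack sizes that
fall on a 64k multiple.  A hedged sketch of the idea (the helper is
illustrative, not the allocator's actual code):

#include <unistd.h>

/* Sketch: add one page whenever a requested stack size is an exact
   multiple of MULTI_PAGE_ALIASING, breaking the 64k aliasing pattern. */
static size_t
perturb_stack_size (size_t size)
{
  if (size % MULTI_PAGE_ALIASING == 0)
    size += sysconf (_SC_PAGESIZE);
  return size;
}
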
diff --git a/REORG.TODO/sysdeps/i386/i686/strcmp.S b/REORG.TODO/sysdeps/i386/i686/strcmp.S
new file mode 100644
index 0000000000..1ae305912e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/strcmp.S
@@ -0,0 +1,52 @@
+/* Highly optimized version for ix86, x>=6.
+ Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (strcmp)
+
+ movl STR1(%esp), %ecx
+ movl STR2(%esp), %edx
+
+L(oop): movb (%ecx), %al
+ cmpb (%edx), %al
+ jne L(neq)
+ incl %ecx
+ incl %edx
+ testb %al, %al
+ jnz L(oop)
+
+ xorl %eax, %eax
+	/* When the strings are equal, the pointers rest one byte
+	   beyond the terminating NULs. */
+ ret
+
+L(neq): movl $1, %eax
+ movl $-1, %ecx
+ cmovbl %ecx, %eax
+
+ ret
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
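
The loop compares one byte at a time; after a mismatch, cmovbl picks -1 when
the carry from cmpb says the byte of STR1 was below the byte of STR2.  A C
rendering of the same logic (sketch; the comparison must use unsigned char
values, which is what the carry flag reflects):

int
strcmp_sketch (const char *s1, const char *s2)
{
  unsigned char c1, c2;
  do
    {
      c1 = (unsigned char) *s1++;
      c2 = (unsigned char) *s2++;
      if (c1 != c2)
        return c1 < c2 ? -1 : 1;   /* the cmovbl above */
    }
  while (c1 != '\0');
  return 0;
}
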
diff --git a/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
new file mode 100644
index 0000000000..51f03fe77b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h
@@ -0,0 +1,44 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdint.h>
+#ifndef __SSE__
+#include_next <tst-stack-align.h>
+#else
+#include <xmmintrin.h>
+
+#define TEST_STACK_ALIGN() \
+ ({ \
+ __m128 _m; \
+ double _d = 12.0; \
+ long double _ld = 15.0; \
+ int _ret = 0; \
+ printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \
+ if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
+ if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
+ if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
+ _ret = 1; \
+ _ret; \
+ })
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i786/Implies b/REORG.TODO/sysdeps/i386/i786/Implies
new file mode 100644
index 0000000000..1cd29f63cf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i786/Implies
@@ -0,0 +1,2 @@
+# The PPro and PII cores are mostly the same.
+i386/i686
diff --git a/REORG.TODO/sysdeps/i386/init-arch.h b/REORG.TODO/sysdeps/i386/init-arch.h
new file mode 100644
index 0000000000..72881c5679
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MINIMUM_ISA 486
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
new file mode 100644
index 0000000000..1c95db7287
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h
@@ -0,0 +1,25 @@
+/* Private macros for accessing __jmp_buf contents. i386 version.
+ Copyright (C) 2006-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define JB_BX 0
+#define JB_SI 1
+#define JB_DI 2
+#define JB_BP 3
+#define JB_SP 4
+#define JB_PC 5
+#define JB_SIZE 24
diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
new file mode 100644
index 0000000000..0a63a832cc
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <setjmp.h>
+#include <jmpbuf-offsets.h>
+#include <stdint.h>
+#include <unwind.h>
+#include <sysdep.h>
+
+/* Test if longjmp to JMPBUF would unwind the frame
+ containing a local variable at ADDRESS. */
+#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \
+ ((void *) (address) < (void *) demangle ((jmpbuf)[JB_SP]))
+
+#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \
+ _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj)
+
+static inline uintptr_t __attribute__ ((unused))
+_jmpbuf_sp (__jmp_buf regs)
+{
+ uintptr_t sp = regs[JB_SP];
+#ifdef PTR_DEMANGLE
+ PTR_DEMANGLE (sp);
+#endif
+ return sp;
+}
+
+#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \
+ ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
+
+/* We use the normal longjmp for unwinding. */
+#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val)
diff --git a/REORG.TODO/sysdeps/i386/ldbl2mpn.c b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
new file mode 100644
index 0000000000..076be0ae7e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldbl2mpn.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#include <ieee754.h>
+#include <float.h>
+#include <stdlib.h>
+
+/* Convert a `long double' in IEEE854 extended-precision format to a
+   multi-precision integer representing the significand scaled up by its
+   number of bits (64 for long double) and an integral power of two
+   (MPN frexpl). */
+
+mp_size_t
+__mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
+ int *expt, int *is_neg,
+ long double value)
+{
+ union ieee854_long_double u;
+ u.d = value;
+
+ *is_neg = u.ieee.negative;
+ *expt = (int) u.ieee.exponent - IEEE854_LONG_DOUBLE_BIAS;
+
+#if BITS_PER_MP_LIMB == 32
+ res_ptr[0] = u.ieee.mantissa1; /* Low-order 32 bits of fraction. */
+ res_ptr[1] = u.ieee.mantissa0; /* High-order 32 bits. */
+ #define N 2
+#elif BITS_PER_MP_LIMB == 64
+ /* Hopefully the compiler will combine the two bitfield extracts
+ and this composition into just the original quadword extract. */
+ res_ptr[0] = ((mp_limb_t) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
+ #define N 1
+#else
+ #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
+#endif
+
+ if (u.ieee.exponent == 0)
+ {
+ /* A biased exponent of zero is a special case.
+ Either it is a zero or it is a denormal number. */
+ if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2. */
+ /* It's zero. */
+ *expt = 0;
+ else
+ {
+ /* It is a denormal number, meaning it has no implicit leading
+ one bit, and its exponent is in fact the format minimum. */
+ int cnt;
+
+	  /* One problem with Intel's 80-bit format is that the explicit
+	     leading one of the normalized representation has to be zero
+	     for denormalized numbers.  If it is one, the number is,
+	     according to Intel's specification, invalid.  We make the
+	     representation unique by explicitly clearing this bit. */
+ res_ptr[N - 1] &= ~((mp_limb_t) 1 << ((LDBL_MANT_DIG - 1) % BITS_PER_MP_LIMB));
+
+ if (res_ptr[N - 1] != 0)
+ {
+ count_leading_zeros (cnt, res_ptr[N - 1]);
+ if (cnt != 0)
+ {
+#if N == 2
+ res_ptr[N - 1] = res_ptr[N - 1] << cnt
+ | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
+ res_ptr[0] <<= cnt;
+#else
+ res_ptr[N - 1] <<= cnt;
+#endif
+ }
+ *expt = LDBL_MIN_EXP - 1 - cnt;
+ }
+ else if (res_ptr[0] != 0)
+ {
+ count_leading_zeros (cnt, res_ptr[0]);
+ res_ptr[N - 1] = res_ptr[0] << cnt;
+ res_ptr[0] = 0;
+ *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt;
+ }
+ else
+ {
+ /* This is the special case of the pseudo denormal number
+ with only the implicit leading bit set. The value is
+ in fact a normal number and so we have to treat this
+ case differently. */
+#if N == 2
+ res_ptr[N - 1] = 0x80000000ul;
+#else
+ res_ptr[0] = 0x8000000000000000ul;
+#endif
+ *expt = LDBL_MIN_EXP - 1;
+ }
+ }
+ }
+ else if (u.ieee.exponent < 0x7fff
+#if N == 2
+ && res_ptr[0] == 0
+#endif
+ && res_ptr[N - 1] == 0)
+ /* Pseudo zero. */
+ *expt = 0;
+
+ return N;
+}
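
union ieee854_long_double splits the 80-bit x87 format into a sign bit, a
15-bit biased exponent, and a 64-bit significand whose integer bit is explicit
(mantissa0 holds the high 32 bits, mantissa1 the low 32).  A standalone hedged
sketch of the same decoding without the glibc union, assuming the little-endian
i386 layout:

#include <stdint.h>
#include <string.h>

/* Sketch: extract sign, biased exponent, and the explicit 64-bit
   significand from an x87 80-bit long double.  Bytes 0-7 hold the
   significand, byte 8 and the low 7 bits of byte 9 the exponent,
   bit 7 of byte 9 the sign. */
static void
decode_x87 (long double value, int *sign, int *biased_exp,
            uint64_t *significand)
{
  unsigned char raw[10];
  memcpy (raw, &value, sizeof raw);
  *significand = 0;
  for (int i = 7; i >= 0; i--)
    *significand = (*significand << 8) | raw[i];
  *biased_exp = ((raw[9] & 0x7f) << 8) | raw[8];
  *sign = raw[9] >> 7;
}
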
diff --git a/REORG.TODO/sysdeps/i386/ldsodefs.h b/REORG.TODO/sysdeps/i386/ldsodefs.h
new file mode 100644
index 0000000000..a369f5fc68
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/ldsodefs.h
@@ -0,0 +1,41 @@
+/* Run-time dynamic linker data structures for loaded ELF shared objects.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _I386_LDSODEFS_H
+#define _I386_LDSODEFS_H 1
+
+#include <elf.h>
+#include <cpu-features.h>
+
+struct La_i86_regs;
+struct La_i86_retval;
+
+#define ARCH_PLTENTER_MEMBERS \
+ Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
+ uintptr_t *, struct La_i86_regs *, \
+ unsigned int *, const char *name, \
+ long int *framesizep)
+
+#define ARCH_PLTEXIT_MEMBERS \
+ unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
+ uintptr_t *, const struct La_i86_regs *, \
+ struct La_i86_retval *, const char *)
+
+#include_next <ldsodefs.h>
+
+#endif
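
ARCH_PLTENTER_MEMBERS declares the slot the dynamic linker fills with an audit
module's la_i86_gnu_pltenter hook.  A hedged sketch of an LD_AUDIT module
providing it (argument types per <link.h>; the printing is illustrative):

#define _GNU_SOURCE
#include <link.h>
#include <stdio.h>

/* Sketch: i386 PLT-enter audit hook.  Build as a shared object and
   load with LD_AUDIT=./audit.so; returning st_value leaves the call
   unredirected. */
Elf32_Addr
la_i86_gnu_pltenter (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
                     uintptr_t *defcook, La_i86_regs *regs,
                     unsigned int *flags, const char *symname,
                     long int *framesizep)
{
  fprintf (stderr, "PLT enter: %s\n", symname);
  return sym->st_value;
}
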
diff --git a/REORG.TODO/sysdeps/i386/link-defines.sym b/REORG.TODO/sysdeps/i386/link-defines.sym
new file mode 100644
index 0000000000..0995adb37f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/link-defines.sym
@@ -0,0 +1,20 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE sizeof (long double)
+
+LR_SIZE sizeof (struct La_i86_regs)
+LR_EDX_OFFSET offsetof (struct La_i86_regs, lr_edx)
+LR_ECX_OFFSET offsetof (struct La_i86_regs, lr_ecx)
+LR_EAX_OFFSET offsetof (struct La_i86_regs, lr_eax)
+LR_EBP_OFFSET offsetof (struct La_i86_regs, lr_ebp)
+LR_ESP_OFFSET offsetof (struct La_i86_regs, lr_esp)
+
+LRV_SIZE sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/REORG.TODO/sysdeps/i386/lshift.S b/REORG.TODO/sysdeps/i386/lshift.S
new file mode 100644
index 0000000000..fa4b07793f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/lshift.S
@@ -0,0 +1,103 @@
+/* i80386 __mpn_lshift --
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+12 /* space for 3 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_lshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 8)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 4)
+ movl SIZE(%esp),%edx
+ movl CNT(%esp),%ecx
+ subl $4,%esi /* adjust s_ptr */
+
+ movl (%esi,%edx,4),%ebx /* read most significant limb */
+ cfi_rel_offset (ebx, 0)
+ cfi_remember_state
+ xorl %eax,%eax
+ shldl %cl,%ebx,%eax /* compute carry limb */
+ decl %edx
+ jz L(end)
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+ testb $1,%dl
+ jnz L(1) /* enter loop in the middle */
+ movl %ebx,%eax
+
+ ALIGN (3)
+L(oop): movl (%esi,%edx,4),%ebx /* load next lower limb */
+ shldl %cl,%ebx,%eax /* compute result limb */
+ movl %eax,(%edi,%edx,4) /* store it */
+ decl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shldl %cl,%eax,%ebx
+ movl %ebx,(%edi,%edx,4)
+ decl %edx
+ jnz L(oop)
+
+ shll %cl,%eax /* compute least significant limb */
+ movl %eax,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_restore_state
+L(end): shll %cl,%ebx /* compute least significant limb */
+ movl %ebx,(%edi) /* store it */
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_lshift)
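
The shldl chain shifts limb pairs so bits flow from each lower limb into the
one above, and the bits shifted out of the most significant limb become the
returned carry.  A C sketch of the same operation (assuming 0 < CNT <
bits-per-limb, as the assembly does; the limb type is illustrative):

#include <stddef.h>

typedef unsigned int mp_limb_sketch_t;           /* 32-bit limb */
#define LIMB_BITS (sizeof (mp_limb_sketch_t) * 8)

/* Sketch: shift a SIZE-limb number left by CNT bits; return the bits
   shifted out of the top limb, like __mpn_lshift above. */
static mp_limb_sketch_t
mpn_lshift_sketch (mp_limb_sketch_t *res, const mp_limb_sketch_t *s,
                   size_t size, unsigned int cnt)
{
  mp_limb_sketch_t carry = s[size - 1] >> (LIMB_BITS - cnt);
  for (size_t i = size - 1; i > 0; i--)
    res[i] = (s[i] << cnt) | (s[i - 1] >> (LIMB_BITS - cnt));
  res[0] = s[0] << cnt;
  return carry;
}
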
diff --git a/REORG.TODO/sysdeps/i386/machine-gmon.h b/REORG.TODO/sysdeps/i386/machine-gmon.h
new file mode 100644
index 0000000000..d5d8cdf7c6
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/machine-gmon.h
@@ -0,0 +1,40 @@
+/* i386-specific implementation of profiling support.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* We need a special version of the `mcount' function since for ix86 it
+   must not clobber any register.  There are several reasons for this:
+   - there is a bug in gcc as of version 2.7.2.2 which prohibits the
+     use of profiling together with nested functions;
+   - the ELF `fixup' function uses GCC's regparm feature;
+   - some (future) systems might want to pass parameters in registers. */
+
+/* We must not pollute the global namespace. */
+#define mcount_internal __mcount_internal
+
+extern void mcount_internal (u_long frompc, u_long selfpc) internal_function;
+
+#define _MCOUNT_DECL(frompc, selfpc) \
+void internal_function mcount_internal (u_long frompc, u_long selfpc)
+
+
+/* Define MCOUNT as empty since we have the implementation in another
+ file. */
+#define MCOUNT
diff --git a/REORG.TODO/sysdeps/i386/memchr.S b/REORG.TODO/sysdeps/i386/memchr.S
new file mode 100644
index 0000000000..db4a6418ff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memchr.S
@@ -0,0 +1,322 @@
+/* memchr (str, chr, len) -- Return pointer to first occurrence of CHR
+   within the first LEN bytes of STR.  For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+ This version is developed using the same algorithm as the fast C
+ version which carries the following introduction:
+ Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+ with help from Dan Sahlin (dan@sics.se) and
+ commentary by Jim Blandy (jimb@ai.mit.edu);
+ adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+ and implemented by Roland McGrath (roland@ai.mit.edu).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+#define LEN CHR+4
+
+ .text
+ENTRY (__memchr)
+
+ /* Save callee-safe registers used in this function. */
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+
+ /* Load parameters into registers. */
+ movl STR(%esp), %eax /* str: pointer to memory block. */
+ movl CHR(%esp), %edx /* c: byte we are looking for. */
+ movl LEN(%esp), %esi /* len: length of memory block. */
+ cfi_rel_offset (esi, 4)
+
+	/* If we must not test more than three characters, test
+	   them one by one.  This is especially true for 0. */
+ cmpl $4, %esi
+ jb L(3)
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16-bit words because these require a
+	   prefix byte (and one more cycle). */
+ movb %dl, %dh /* Now it is 0|0|c|c */
+ movl %edx, %ecx
+ shll $16, %edx /* Now c|c|0|0 */
+ movw %cx, %dx /* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32-bit)
+	   memory access is aligned on a four-byte boundary.  So
+	   process the first bytes one by one until the boundary is
+	   reached.  For better performance we don't use a loop. */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(2) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+ decl %esi /* decrement length counter */
+ je L(4) /* len==0 => return NULL */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(2) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+ decl %esi /* decrement length counter */
+ je L(4) /* len==0 => return NULL */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(2) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+ decl %esi /* decrement length counter */
+ /* no test for len==0 here, because this is done in the
+ loop head */
+ jmp L(2)
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for CHR, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is CHR. This turns each byte that is CHR
+ into a zero. */
+
+
+	/* Each round, the main loop processes 16 bytes. */
+
+ ALIGN (4)
+
+L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can test this condition now.  If no carry is
+	   signaled, no overflow can have occurred in the last byte => it was 0. */
+ jnc L(8)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is CHR we don't get 0 in %edi. */
+ jnz L(8) /* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (following LL(13) below).  Even the length can be compared
+	   with constants instead of being decremented each time. */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(7) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(7) /* found it => return pointer */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(6) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(6) /* found it => return pointer */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(5) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(5) /* found it => return pointer */
+
+ /* Adjust both counters for a full round, i.e. 16 bytes. */
+ addl $16, %eax
+L(2): subl $16, %esi
+ jae L(1) /* Still more than 16 bytes remaining */
+
+ /* Process remaining bytes separately. */
+ cmpl $4-16, %esi /* rest < 4 bytes? */
+	jb	L(3)			/* yes, then test byte by byte */
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(8) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jne L(8) /* found it => return pointer */
+ addl $4, %eax /* adjust source pointer */
+
+ cmpl $8-16, %esi /* rest < 8 bytes? */
+	jb	L(3)			/* yes, then test byte by byte */
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(8) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jne L(8) /* found it => return pointer */
+ addl $4, %eax /* adjust source pointer */
+
+ cmpl $12-16, %esi /* rest < 12 bytes? */
+	jb	L(3)			/* yes, then test byte by byte */
+
+ movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(8) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jne L(8) /* found it => return pointer */
+ addl $4, %eax /* adjust source pointer */
+
+ /* Check the remaining bytes one by one. */
+L(3): andl $3, %esi /* mask out uninteresting bytes */
+ jz L(4) /* no remaining bytes => return NULL */
+
+ cmpb %dl, (%eax) /* compare byte with CHR */
+	je	L(9)			/* equal, then return pointer */
+ incl %eax /* increment source pointer */
+ decl %esi /* decrement length */
+ jz L(4) /* no remaining bytes => return NULL */
+
+ cmpb %dl, (%eax) /* compare byte with CHR */
+	je	L(9)			/* equal, then return pointer */
+ incl %eax /* increment source pointer */
+ decl %esi /* decrement length */
+ jz L(4) /* no remaining bytes => return NULL */
+
+ cmpb %dl, (%eax) /* compare byte with CHR */
+	je	L(9)			/* equal, then return pointer */
+
+L(4): /* no byte found => return NULL */
+ xorl %eax, %eax
+ jmp L(9)
+
+ /* add missing source pointer increments */
+L(5): addl $4, %eax
+L(6): addl $4, %eax
+L(7): addl $4, %eax
+
+ /* Test for the matching byte in the word. %ecx contains a NUL
+ char in the byte which originally was the byte we are looking
+ at. */
+L(8): testb %cl, %cl /* test first byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+ testb %ch, %ch /* test second byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+ testl $0xff0000, %ecx /* test third byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+	/* No further test needed: we know it is one of the four bytes. */
+L(9): popl %edi /* pop saved registers */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+
+ ret
+END (__memchr)
+
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
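
The magic constant works because carries propagate across the 0xff/0xfe hole
gaps only when the corresponding byte is nonzero.  A C sketch of the per-word
test, matching the jnc/xor/or/inc sequence above (one word, detection only):

#include <stdint.h>

/* Sketch: does the 32-bit WORD contain byte C?  Mirrors the
   0xfefefeff trick used in the loop above. */
static int
word_has_byte (uint32_t word, unsigned char c)
{
  uint32_t magic = 0xfefefeffu;
  uint32_t masked = word ^ (c * 0x01010101u);  /* matching bytes -> 0 */
  uint32_t sum = masked + magic;
  if (sum >= masked)            /* no carry out: a zero byte stopped it */
    return 1;
  /* Carry escaped; check the three inner hole bits (8, 16, 24). */
  return (((sum ^ masked) | magic) + 1) != 0;
}
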
diff --git a/REORG.TODO/sysdeps/i386/memcmp.S b/REORG.TODO/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000000..01f8f8ef03
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcmp.S
@@ -0,0 +1,73 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+
+ .text
+ENTRY (memcmp)
+
+ pushl %esi /* Save callee-safe registers. */
+ cfi_adjust_cfa_offset (4)
+	movl	%edi, %edx	/* Note that %edx is not used and so can
+				   be used to save %edi.  It's faster. */
+ cfi_register (edi, edx)
+
+ movl BLK1(%esp), %esi
+ cfi_rel_offset (esi, 0)
+ movl BLK2(%esp), %edi
+ movl LEN(%esp), %ecx
+
+ cld /* Set direction of comparison. */
+
+ xorl %eax, %eax /* Default result. */
+
+ repe /* Compare at most %ecx bytes. */
+ cmpsb
+ jz L(1) /* If even last byte was equal we return 0. */
+
+	/* The memory blocks are not equal.  So the result of the last
+	   subtraction is present in the carry flag.  It is set when
+	   the byte in block #2 is bigger.  In this case we have to
+	   return -1 (=0xffffffff), else 1. */
+ sbbl %eax, %eax /* This is tricky. %eax == 0 and carry is set
+ or not depending on last subtraction. */
+
+	/* At this point %eax == 0 if the byte of block #1 was bigger, and
+	   0xffffffff if the last byte of block #2 was bigger.  The latter
+	   case is already correct but the former needs a little adjustment.
+	   Note that the following operation does not change 0xffffffff. */
+ orb $1, %al /* Change 0 to 1. */
+
+L(1): popl %esi /* Restore registers. */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ movl %edx, %edi
+ cfi_restore (edi)
+
+ ret
+END (memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
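
The post-cmpsb fixup maps the carry flag to the return value without a branch.
The same two steps in C (sketch; CARRY is 1 when the byte from block #2 was
bigger):

static int
memcmp_result_from_carry (int carry)
{
  int r = -carry;   /* sbbl %eax,%eax: 0, or -1 (0xffffffff) */
  r |= 1;           /* orb $1,%al: 0 becomes 1, -1 is unchanged */
  return r;
}
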
diff --git a/REORG.TODO/sysdeps/i386/memcopy.h b/REORG.TODO/sysdeps/i386/memcopy.h
new file mode 100644
index 0000000000..dc6173ee29
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcopy.h
@@ -0,0 +1,92 @@
+/* memcopy.h -- definitions for memory copy functions. i386 version.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Torbjorn Granlund (tege@sics.se).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdeps/generic/memcopy.h>
+
+#undef OP_T_THRES
+#define OP_T_THRES 8
+
+#undef BYTE_COPY_FWD
+#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \
+ do { \
+ int __d0; \
+ asm volatile(/* Clear the direction flag, so copying goes forward. */ \
+ "cld\n" \
+ /* Copy bytes. */ \
+ "rep\n" \
+ "movsb" : \
+ "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) : \
+ "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \
+ "memory"); \
+ } while (0)
+
+#undef BYTE_COPY_BWD
+#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes) \
+ do \
+ { \
+ int __d0; \
+ asm volatile(/* Set the direction flag, so copying goes backwards. */ \
+ "std\n" \
+ /* Copy bytes. */ \
+ "rep\n" \
+ "movsb\n" \
+ /* Clear the dir flag. Convention says it should be 0. */ \
+ "cld" : \
+ "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) : \
+ "0" (dst_ep - 1), "1" (src_ep - 1), "2" (nbytes) : \
+ "memory"); \
+ dst_ep += 1; \
+ src_ep += 1; \
+ } while (0)
+
+#undef WORD_COPY_FWD
+#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
+ do \
+ { \
+ int __d0; \
+ asm volatile(/* Clear the direction flag, so copying goes forward. */ \
+ "cld\n" \
+ /* Copy longwords. */ \
+ "rep\n" \
+ "movsl" : \
+ "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) : \
+ "0" (dst_bp), "1" (src_bp), "2" ((nbytes) / 4) : \
+ "memory"); \
+ (nbytes_left) = (nbytes) % 4; \
+ } while (0)
+
+#undef WORD_COPY_BWD
+#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \
+ do \
+ { \
+ int __d0; \
+ asm volatile(/* Set the direction flag, so copying goes backwards. */ \
+ "std\n" \
+ /* Copy longwords. */ \
+ "rep\n" \
+ "movsl\n" \
+ /* Clear the dir flag. Convention says it should be 0. */ \
+ "cld" : \
+ "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) : \
+ "0" (dst_ep - 4), "1" (src_ep - 4), "2" ((nbytes) / 4) : \
+ "memory"); \
+ dst_ep += 4; \
+ src_ep += 4; \
+ (nbytes_left) = (nbytes) % 4; \
+ } while (0)
diff --git a/REORG.TODO/sysdeps/i386/memcpy.S b/REORG.TODO/sysdeps/i386/memcpy.S
new file mode 100644
index 0000000000..06568ea724
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy.S
@@ -0,0 +1,95 @@
+/* memcpy with REP MOVSB/STOSB
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+# define MEMCPY_CHK __memcpy_chk
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2 12
+# define STR1 STR2+4
+# define N STR1+4
+#else
+# define STR1 12
+# define STR2 STR1+4
+# define N STR2+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ PUSH (%esi)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+ movl STR2(%esp), %esi
+ mov %edi, %eax
+#ifdef USE_AS_MEMPCPY
+ add %ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %esi, %edi
+ ja L(copy_backward)
+ je L(bwd_write_0bytes)
+#endif
+
+ rep movsb
+ POP (%edi)
+ POP (%esi)
+ ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+ lea -1(%edi,%ecx), %edi
+ lea -1(%esi,%ecx), %esi
+ std
+ rep movsb
+ cld
+L(bwd_write_0bytes):
+ POP (%edi)
+ POP (%esi)
+ ret
+#endif
+
+END (MEMCPY)
+
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (MEMCPY)
+#endif
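
Built as memmove, the routine compares the pointers and copies backwards
(direction flag set) when the destination starts above an overlapping source.
The same decision in C (sketch; byte loops stand in for rep movsb):

#include <stddef.h>

static void *
memmove_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if (d <= s)              /* forward copy cannot clobber the source */
    while (n--)
      *d++ = *s++;
  else                     /* overlap possible: copy from the end */
    while (n--)
      d[n] = s[n];
  return dst;
}
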
diff --git a/REORG.TODO/sysdeps/i386/memcpy_chk.S b/REORG.TODO/sysdeps/i386/memcpy_chk.S
new file mode 100644
index 0000000000..0f6f585c41
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcpy_chk.S
@@ -0,0 +1,34 @@
+/* Checking memcpy for i386.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+ /* For libc.so this is defined in memcpy.S.
+ For libc.a, this is a separate source to avoid
+ memcpy bringing in __chk_fail and all routines
+ it calls. */
+ .text
+ENTRY (__memcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp memcpy
+END (__memcpy_chk)
+#endif
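
__memcpy_chk takes the destination object's size as a fourth argument
(16(%esp)) and fails hard when it is smaller than the copy length at 12(%esp).
The semantics in C (sketch; __chk_fail_stub stands in for glibc's internal
__chk_fail):

#include <stddef.h>
#include <string.h>

extern void __chk_fail_stub (void) __attribute__ ((noreturn));

void *
memcpy_chk_sketch (void *dst, const void *src, size_t n, size_t dstlen)
{
  if (dstlen < n)
    __chk_fail_stub ();        /* buffer overflow detected */
  return memcpy (dst, src, n);
}
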
diff --git a/REORG.TODO/sysdeps/i386/memmove.S b/REORG.TODO/sysdeps/i386/memmove.S
new file mode 100644
index 0000000000..60a45d21e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY memmove
+#define MEMCPY_CHK __memmove_chk
+#include "memcpy.S"
diff --git a/REORG.TODO/sysdeps/i386/memmove_chk.S b/REORG.TODO/sysdeps/i386/memmove_chk.S
new file mode 100644
index 0000000000..0c7037cc05
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memmove_chk.S
@@ -0,0 +1,33 @@
+/* Checking memmove for i386
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memmove.S. For libc.a, this is a
+ separate source to avoid memmove bringing in __chk_fail and all
+ routines it calls. */
+ .text
+ENTRY (__memmove_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp memmove
+END (__memmove_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/mempcpy.S b/REORG.TODO/sysdeps/i386/mempcpy.S
new file mode 100644
index 0000000000..61addb75f4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy
+#define MEMCPY_CHK __mempcpy_chk
+#include "memcpy.S"
+
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/REORG.TODO/sysdeps/i386/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
new file mode 100644
index 0000000000..4d8ac5c25b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mempcpy_chk.S
@@ -0,0 +1,33 @@
+/* Checking mempcpy for i386
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in mempcpy.S. For libc.a, this is a
+ separate source to avoid mempcpy bringing in __chk_fail and all
+ routines it calls. */
+ .text
+ENTRY (__mempcpy_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp mempcpy
+END (__mempcpy_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset.S b/REORG.TODO/sysdeps/i386/memset.S
new file mode 100644
index 0000000000..46ae65d2e4
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset.S
@@ -0,0 +1,68 @@
+/* memset with REP MOVSB/STOSB
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define STR1 8
+#ifdef USE_AS_BZERO
+#define N STR1+4
+#else
+#define STR2 STR1+4
+#define N STR2+4
+#endif
+
+ .text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl STR2(%esp), %eax
+ mov %edi, %edx
+#endif
+ rep stosb
+#ifndef USE_AS_BZERO
+ mov %edx, %eax
+#endif
+ POP (%edi)
+ ret
+END (memset)
+
+#ifndef USE_AS_BZERO
+libc_hidden_builtin_def (memset)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memset_chk.S b/REORG.TODO/sysdeps/i386/memset_chk.S
new file mode 100644
index 0000000000..da7837111e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memset_chk.S
@@ -0,0 +1,33 @@
+/* Checking memset for i386.
+ Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memset.S. For libc.a, this is a
+ separate source to avoid memset bringing in __chk_fail and all
+ routines it calls. */
+ .text
+ENTRY (__memset_chk)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp memset
+END (__memset_chk)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/memusage.h b/REORG.TODO/sysdeps/i386/memusage.h
new file mode 100644
index 0000000000..30167be833
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memusage.h
@@ -0,0 +1,20 @@
+/* Copyright (C) 2000-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; })
+
+#include <sysdeps/generic/memusage.h>
diff --git a/REORG.TODO/sysdeps/i386/mp_clz_tab.c b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
new file mode 100644
index 0000000000..860f98cc62
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mp_clz_tab.c
@@ -0,0 +1 @@
+/* __clz_tab not needed on i386. */
diff --git a/REORG.TODO/sysdeps/i386/mul_1.S b/REORG.TODO/sysdeps/i386/mul_1.S
new file mode 100644
index 0000000000..cf83d1b343
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/mul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_mul_1)
+
+ pushl %res_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %s1_ptr
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %s2_limb
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp), %res_ptr
+ cfi_rel_offset (res_ptr, 12)
+ movl S1(%esp), %s1_ptr
+ cfi_rel_offset (s1_ptr, 8)
+ movl SIZE(%esp), %size
+ movl S2LIMB(%esp), %s2_limb
+ cfi_rel_offset (s2_limb, 0)
+ leal (%res_ptr,%size,4), %res_ptr
+ leal (%s1_ptr,%size,4), %s1_ptr
+ negl %size
+ xorl %ebp, %ebp
+ cfi_rel_offset (ebp, 4)
+ ALIGN (3)
+L(oop):
+ movl (%s1_ptr,%size,4), %eax
+ mull %s2_limb
+ addl %ebp, %eax
+ movl %eax, (%res_ptr,%size,4)
+ adcl $0, %edx
+ movl %edx, %ebp
+
+ incl %size
+ jnz L(oop)
+ movl %ebp, %eax
+
+ popl %s2_limb
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s2_limb)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %s1_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (s1_ptr)
+ popl %res_ptr
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (res_ptr)
+
+ ret
+#undef size
+END (__mpn_mul_1)
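
For reference, the operation the loop above implements, written in portable C with 32-bit limbs (a sketch, not the GMP source):

  #include <stddef.h>
  #include <stdint.h>

  /* res[i] = low limb of s1[i] * s2limb + carry; the final carry limb
     is returned, matching %eax at the end of the assembly.  */
  uint32_t
  mpn_mul_1_ref (uint32_t *res, const uint32_t *s1, size_t size,
                 uint32_t s2limb)
  {
    uint32_t carry = 0;
    for (size_t i = 0; i < size; i++)
      {
        uint64_t prod = (uint64_t) s1[i] * s2limb + carry;  /* mull, addl */
        res[i] = (uint32_t) prod;                           /* store low limb */
        carry = (uint32_t) (prod >> 32);                    /* adcl $0, %edx */
      }
    return carry;
  }
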
diff --git a/REORG.TODO/sysdeps/i386/nptl/Makefile b/REORG.TODO/sysdeps/i386/nptl/Makefile
new file mode 100644
index 0000000000..2c61b352eb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/Makefile
@@ -0,0 +1,26 @@
+# Copyright (C) 2002-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += tcb-offsets.sym
+endif
+
+ifeq ($(subdir),nptl)
+CFLAGS-pthread_create.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align2.c += -mpreferred-stack-boundary=4
+endif
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
new file mode 100644
index 0000000000..a1205b9698
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c
@@ -0,0 +1,19 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Not needed. pthread_spin_init is an alias for pthread_spin_unlock. */
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
new file mode 100644
index 0000000000..160244b7a8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S
@@ -0,0 +1,37 @@
+/* Copyright (C) 2012-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <lowlevellock.h>
+
+ .globl pthread_spin_lock
+ .type pthread_spin_lock,@function
+ .align 16
+pthread_spin_lock:
+ mov 4(%esp), %eax
+1: LOCK
+ decl 0(%eax)
+ jne 2f
+ xor %eax, %eax
+ ret
+
+ .align 16
+2: rep
+ nop
+ cmpl $0, 0(%eax)
+ jg 1b
+ jmp 2b
+ .size pthread_spin_lock,.-pthread_spin_lock
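
The locking discipline above, sketched in C with GCC builtins: the lock word starts at 1 (free), LOCK decl takes it to 0 on success, and waiters pause until the owner stores a positive value again. Hypothetical helper name; illustration only.

  void
  spin_lock_sketch (volatile int *lock)
  {
    for (;;)
      {
        /* Corresponds to "LOCK decl": a result of 0 means we own
           the lock.  */
        if (__atomic_sub_fetch (lock, 1, __ATOMIC_ACQUIRE) == 0)
          return;
        /* "rep nop" is the PAUSE instruction: spin politely until
           the word goes positive, then retry the decrement.  */
        while (*lock <= 0)
          __builtin_ia32_pause ();
      }
  }
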
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
new file mode 100644
index 0000000000..b6636ae8d7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S
@@ -0,0 +1,31 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+ .globl pthread_spin_unlock
+ .type pthread_spin_unlock,@function
+ .align 16
+pthread_spin_unlock:
+ movl 4(%esp), %eax
+ movl $1, (%eax)
+ xorl %eax, %eax
+ ret
+ .size pthread_spin_unlock,.-pthread_spin_unlock
+
+ /* The implementation of pthread_spin_init is identical. */
+ .globl pthread_spin_init
+pthread_spin_init = pthread_spin_unlock
diff --git a/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
new file mode 100644
index 0000000000..54abccd11b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Default stack size. */
+#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024)
+
+/* Required stack pointer alignment at beginning. SSE requires 16
+ bytes. */
+#define STACK_ALIGN 16
+
+/* Minimal stack size after allocating thread descriptor and guard size. */
+#define MINIMAL_REST_STACK 2048
+
+/* Alignment requirement for TCB.
+
+ Some processors such as Intel Atom pay a big penalty on every
+ access using a segment override if that segment's base is not
+ aligned to the size of a cache line. (See Intel 64 and IA-32
+ Architectures Optimization Reference Manual, section 13.3.3.3,
+ "Segment Base".) On such machines, a cache line is 64 bytes. */
+#define TCB_ALIGNMENT 64
+
+
+/* Location of current stack frame. */
+#define CURRENT_STACK_FRAME __builtin_frame_address (0)
diff --git a/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
new file mode 100644
index 0000000000..695a810386
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym
@@ -0,0 +1,17 @@
+#include <sysdep.h>
+#include <tls.h>
+#include <kernel-features.h>
+
+RESULT offsetof (struct pthread, result)
+TID offsetof (struct pthread, tid)
+CANCELHANDLING offsetof (struct pthread, cancelhandling)
+CLEANUP_JMP_BUF offsetof (struct pthread, cleanup_jmp_buf)
+MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads)
+SYSINFO_OFFSET offsetof (tcbhead_t, sysinfo)
+CLEANUP offsetof (struct pthread, cleanup)
+CLEANUP_PREV offsetof (struct _pthread_cleanup_buffer, __prev)
+MUTEX_FUTEX offsetof (pthread_mutex_t, __data.__lock)
+POINTER_GUARD offsetof (tcbhead_t, pointer_guard)
+#ifndef __ASSUME_PRIVATE_FUTEX
+PRIVATE_FUTEX offsetof (tcbhead_t, private_futex)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/nptl/tls.h b/REORG.TODO/sysdeps/i386/nptl/tls.h
new file mode 100644
index 0000000000..f9a6b11ecf
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/nptl/tls.h
@@ -0,0 +1,435 @@
+/* Definition for thread-local data handling. nptl/i386 version.
+ Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _TLS_H
+#define _TLS_H 1
+
+#include <dl-sysdep.h>
+#ifndef __ASSEMBLER__
+# include <stdbool.h>
+# include <stddef.h>
+# include <stdint.h>
+# include <stdlib.h>
+# include <sysdep.h>
+# include <libc-pointer-arith.h> /* For cast_to_integer. */
+# include <kernel-features.h>
+# include <dl-dtv.h>
+
+typedef struct
+{
+ void *tcb; /* Pointer to the TCB. Not necessarily the
+ thread descriptor used by libpthread. */
+ dtv_t *dtv;
+ void *self; /* Pointer to the thread descriptor. */
+ int multiple_threads;
+ uintptr_t sysinfo;
+ uintptr_t stack_guard;
+ uintptr_t pointer_guard;
+ int gscope_flag;
+#ifndef __ASSUME_PRIVATE_FUTEX
+ int private_futex;
+#else
+ int __glibc_reserved1;
+#endif
+ /* Reservation of some values for the TM ABI. */
+ void *__private_tm[4];
+ /* GCC split stack support. */
+ void *__private_ss;
+} tcbhead_t;
+
+# define TLS_MULTIPLE_THREADS_IN_TCB 1
+
+#else /* __ASSEMBLER__ */
+# include <tcb-offsets.h>
+#endif
+
+
+/* Alignment requirement for the stack. For IA-32 this is governed by
+ the SSE memory functions. */
+#define STACK_ALIGN 16
+
+#ifndef __ASSEMBLER__
+/* Get system call information. */
+# include <sysdep.h>
+
+/* The old way: using LDT. */
+
+/* Structure passed to `modify_ldt', 'set_thread_area', and 'clone' calls. */
+struct user_desc
+{
+ unsigned int entry_number;
+ unsigned long int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+ unsigned int empty:25;
+};
+
+/* Initializing bit fields is slow. We speed it up by using a union. */
+union user_desc_init
+{
+ struct user_desc desc;
+ unsigned int vals[4];
+};
+
+
+/* This is the size of the initial TCB. Can't be just sizeof (tcbhead_t),
+ because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole
+ struct pthread even when not linked with -lpthread. */
+# define TLS_INIT_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the initial TCB. */
+# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread)
+
+/* This is the size of the TCB. */
+# define TLS_TCB_SIZE sizeof (struct pthread)
+
+/* Alignment requirements for the TCB. */
+# define TLS_TCB_ALIGN __alignof__ (struct pthread)
+
+/* The TCB can have any size and the memory following the address the
+ thread pointer points to is unspecified. Allocate the TCB there. */
+# define TLS_TCB_AT_TP 1
+# define TLS_DTV_AT_TP 0
+
+/* Get the thread descriptor definition. */
+# include <nptl/descr.h>
+
+
+/* Install the dtv pointer. The pointer passed is to the element with
+ index -1, which contains the length. */
+# define INSTALL_DTV(descr, dtvp) \
+ ((tcbhead_t *) (descr))->dtv = (dtvp) + 1
+
+/* Install new dtv for current thread. */
+# define INSTALL_NEW_DTV(dtvp) \
+ ({ struct pthread *__pd; \
+ THREAD_SETMEM (__pd, header.dtv, (dtvp)); })
+
+/* Return dtv of given thread descriptor. */
+# define GET_DTV(descr) \
+ (((tcbhead_t *) (descr))->dtv)
+
+/* Macros to load from and store into segment registers. */
+# ifndef TLS_GET_GS
+# define TLS_GET_GS() \
+ ({ int __seg; __asm ("movw %%gs, %w0" : "=q" (__seg)); __seg & 0xffff; })
+# endif
+# ifndef TLS_SET_GS
+# define TLS_SET_GS(val) \
+ __asm ("movw %w0, %%gs" :: "q" (val))
+# endif
+
+#ifdef NEED_DL_SYSINFO
+# define INIT_SYSINFO \
+ _head->sysinfo = GLRO(dl_sysinfo)
+# define SETUP_THREAD_SYSINFO(pd) \
+ ((pd)->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+# define CHECK_THREAD_SYSINFO(pd) \
+ assert ((pd)->header.sysinfo == THREAD_GETMEM (THREAD_SELF, header.sysinfo))
+#else
+# define INIT_SYSINFO
+#endif
+
+#ifndef LOCK_PREFIX
+# ifdef UP
+# define LOCK_PREFIX /* nothing */
+# else
+# define LOCK_PREFIX "lock;"
+# endif
+#endif
+
+static inline void __attribute__ ((unused, always_inline))
+tls_fill_user_desc (union user_desc_init *desc,
+ unsigned int entry_number,
+ void *pd)
+{
+ desc->vals[0] = entry_number;
+ /* The 'base_addr' field. Pointer to the TCB. */
+ desc->vals[1] = (unsigned long int) pd;
+ /* The 'limit' field. We use 4GB which is 0xfffff pages. */
+ desc->vals[2] = 0xfffff;
+ /* Collapsed value of the bitfield:
+ .seg_32bit = 1
+ .contents = 0
+ .read_exec_only = 0
+ .limit_in_pages = 1
+ .seg_not_present = 0
+ .useable = 1 */
+ desc->vals[3] = 0x51;
+}
+
+/* Code to initialize the thread pointer at startup. This might need
+ special attention since 'errno' is not yet available and if the
+ operation can cause a failure 'errno' must not be touched. */
+# define TLS_INIT_TP(thrdescr) \
+ ({ void *_thrdescr = (thrdescr); \
+ tcbhead_t *_head = _thrdescr; \
+ union user_desc_init _segdescr; \
+ int _result; \
+ \
+ _head->tcb = _thrdescr; \
+ /* For now the thread descriptor is at the same address. */ \
+ _head->self = _thrdescr; \
+ /* New syscall handling support. */ \
+ INIT_SYSINFO; \
+ \
+ /* Let the kernel pick a value for the 'entry_number' field. */ \
+ tls_fill_user_desc (&_segdescr, -1, _thrdescr); \
+ \
+ /* Install the TLS. */ \
+ INTERNAL_SYSCALL_DECL (err); \
+ _result = INTERNAL_SYSCALL (set_thread_area, err, 1, &_segdescr.desc); \
+ \
+ if (_result == 0) \
+ /* We know the index in the GDT, now load the segment register. \
+ The use of the GDT is described by the value 3 in the lower \
+ three bits of the segment descriptor value. \
+ \
+ Note that we have to do this even if the numeric value of \
+ the descriptor does not change. Loading the segment register \
+ causes the segment information from the GDT to be loaded \
+ which is necessary since we have changed it. */ \
+ TLS_SET_GS (_segdescr.desc.entry_number * 8 + 3); \
+ \
+ _result == 0 ? NULL \
+ : "set_thread_area failed when setting up thread-local storage\n"; })
+
+# define TLS_DEFINE_INIT_TP(tp, pd) \
+ union user_desc_init _segdescr; \
+ /* Find the 'entry_number' field that the kernel selected in TLS_INIT_TP. \
+ The first three bits of the segment register value select the GDT, \
+ ignore them. We get the index from the value of the %gs register in \
+ the current thread. */ \
+ tls_fill_user_desc (&_segdescr, TLS_GET_GS () >> 3, pd); \
+ const struct user_desc *tp = &_segdescr.desc
+
+
+/* Return the address of the dtv for the current thread. */
+# define THREAD_DTV() \
+ ({ struct pthread *__pd; \
+ THREAD_GETMEM (__pd, header.dtv); })
+
+
+/* Return the thread descriptor for the current thread.
+
+ The contained asm must *not* be marked volatile since otherwise
+ assignments like
+ pthread_descr self = thread_self();
+ do not get optimized away. */
+# define THREAD_SELF \
+ ({ struct pthread *__self; \
+ asm ("movl %%gs:%c1,%0" : "=r" (__self) \
+ : "i" (offsetof (struct pthread, header.self))); \
+ __self;})
+
+/* Magic for libthread_db to know how to do THREAD_SELF. */
+# define DB_THREAD_SELF \
+ REGISTER_THREAD_AREA (32, offsetof (struct user_regs_struct, xgs), 3) \
+ REGISTER_THREAD_AREA (64, 26 * 8, 3) /* x86-64's user_regs_struct->gs */
+
+
+/* Read member of the thread descriptor directly. */
+# define THREAD_GETMEM(descr, member) \
+ ({ __typeof (descr->member) __value; \
+ if (sizeof (__value) == 1) \
+ asm volatile ("movb %%gs:%P2,%b0" \
+ : "=q" (__value) \
+ : "0" (0), "i" (offsetof (struct pthread, member))); \
+ else if (sizeof (__value) == 4) \
+ asm volatile ("movl %%gs:%P1,%0" \
+ : "=r" (__value) \
+ : "i" (offsetof (struct pthread, member))); \
+ else \
+ { \
+ if (sizeof (__value) != 8) \
+ /* There should not be any value with a size other than 1, \
+ 4 or 8. */ \
+ abort (); \
+ \
+ asm volatile ("movl %%gs:%P1,%%eax\n\t" \
+ "movl %%gs:%P2,%%edx" \
+ : "=A" (__value) \
+ : "i" (offsetof (struct pthread, member)), \
+ "i" (offsetof (struct pthread, member) + 4)); \
+ } \
+ __value; })
+
+
+/* Same as THREAD_GETMEM, but the member offset can be non-constant. */
+# define THREAD_GETMEM_NC(descr, member, idx) \
+ ({ __typeof (descr->member[0]) __value; \
+ if (sizeof (__value) == 1) \
+ asm volatile ("movb %%gs:%P2(%3),%b0" \
+ : "=q" (__value) \
+ : "0" (0), "i" (offsetof (struct pthread, member[0])), \
+ "r" (idx)); \
+ else if (sizeof (__value) == 4) \
+ asm volatile ("movl %%gs:%P1(,%2,4),%0" \
+ : "=r" (__value) \
+ : "i" (offsetof (struct pthread, member[0])), \
+ "r" (idx)); \
+ else \
+ { \
+ if (sizeof (__value) != 8) \
+ /* There should not be any value with a size other than 1, \
+ 4 or 8. */ \
+ abort (); \
+ \
+ asm volatile ("movl %%gs:%P1(,%2,8),%%eax\n\t" \
+ "movl %%gs:4+%P1(,%2,8),%%edx" \
+ : "=&A" (__value) \
+ : "i" (offsetof (struct pthread, member[0])), \
+ "r" (idx)); \
+ } \
+ __value; })
+
+
+
+/* Set member of the thread descriptor directly. */
+# define THREAD_SETMEM(descr, member, value) \
+ ({ if (sizeof (descr->member) == 1) \
+ asm volatile ("movb %b0,%%gs:%P1" : \
+ : "iq" (value), \
+ "i" (offsetof (struct pthread, member))); \
+ else if (sizeof (descr->member) == 4) \
+ asm volatile ("movl %0,%%gs:%P1" : \
+ : "ir" (value), \
+ "i" (offsetof (struct pthread, member))); \
+ else \
+ { \
+ if (sizeof (descr->member) != 8) \
+ /* There should not be any value with a size other than 1, \
+ 4 or 8. */ \
+ abort (); \
+ \
+ asm volatile ("movl %%eax,%%gs:%P1\n\t" \
+ "movl %%edx,%%gs:%P2" : \
+ : "A" ((uint64_t) cast_to_integer (value)), \
+ "i" (offsetof (struct pthread, member)), \
+ "i" (offsetof (struct pthread, member) + 4)); \
+ }})
+
+
+/* Same as THREAD_SETMEM, but the member offset can be non-constant. */
+# define THREAD_SETMEM_NC(descr, member, idx, value) \
+ ({ if (sizeof (descr->member[0]) == 1) \
+ asm volatile ("movb %b0,%%gs:%P1(%2)" : \
+ : "iq" (value), \
+ "i" (offsetof (struct pthread, member)), \
+ "r" (idx)); \
+ else if (sizeof (descr->member[0]) == 4) \
+ asm volatile ("movl %0,%%gs:%P1(,%2,4)" : \
+ : "ir" (value), \
+ "i" (offsetof (struct pthread, member)), \
+ "r" (idx)); \
+ else \
+ { \
+ if (sizeof (descr->member[0]) != 8) \
+ /* There should not be any value with a size other than 1, \
+ 4 or 8. */ \
+ abort (); \
+ \
+ asm volatile ("movl %%eax,%%gs:%P1(,%2,8)\n\t" \
+ "movl %%edx,%%gs:4+%P1(,%2,8)" : \
+ : "A" ((uint64_t) cast_to_integer (value)), \
+ "i" (offsetof (struct pthread, member)), \
+ "r" (idx)); \
+ }})
+
+
+/* Atomic compare and exchange on TLS, returning old value. */
+#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \
+ ({ __typeof (descr->member) __ret; \
+ __typeof (oldval) __old = (oldval); \
+ if (sizeof (descr->member) == 4) \
+ asm volatile (LOCK_PREFIX "cmpxchgl %2, %%gs:%P3" \
+ : "=a" (__ret) \
+ : "0" (__old), "r" (newval), \
+ "i" (offsetof (struct pthread, member))); \
+ else \
+ /* Not necessary for other sizes at the moment. */ \
+ abort (); \
+ __ret; })
+
+
+/* Atomic logical and. */
+#define THREAD_ATOMIC_AND(descr, member, val) \
+ (void) ({ if (sizeof ((descr)->member) == 4) \
+ asm volatile (LOCK_PREFIX "andl %1, %%gs:%P0" \
+ :: "i" (offsetof (struct pthread, member)), \
+ "ir" (val)); \
+ else \
+ /* Not necessary for other sizes at the moment. */ \
+ abort (); })
+
+
+/* Atomic set bit. */
+#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \
+ (void) ({ if (sizeof ((descr)->member) == 4) \
+ asm volatile (LOCK_PREFIX "orl %1, %%gs:%P0" \
+ :: "i" (offsetof (struct pthread, member)), \
+ "ir" (1 << (bit))); \
+ else \
+ /* Not necessary for other sizes at the moment. */ \
+ abort (); })
+
+
+/* Set the stack guard field in TCB head. */
+#define THREAD_SET_STACK_GUARD(value) \
+ THREAD_SETMEM (THREAD_SELF, header.stack_guard, value)
+#define THREAD_COPY_STACK_GUARD(descr) \
+ ((descr)->header.stack_guard \
+ = THREAD_GETMEM (THREAD_SELF, header.stack_guard))
+
+
+/* Set the pointer guard field in the TCB head. */
+#define THREAD_SET_POINTER_GUARD(value) \
+ THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value)
+#define THREAD_COPY_POINTER_GUARD(descr) \
+ ((descr)->header.pointer_guard \
+ = THREAD_GETMEM (THREAD_SELF, header.pointer_guard))
+
+
+/* Get and set the global scope generation counter in the TCB head. */
+#define THREAD_GSCOPE_FLAG_UNUSED 0
+#define THREAD_GSCOPE_FLAG_USED 1
+#define THREAD_GSCOPE_FLAG_WAIT 2
+#define THREAD_GSCOPE_RESET_FLAG() \
+ do \
+ { int __res; \
+ asm volatile ("xchgl %0, %%gs:%P1" \
+ : "=r" (__res) \
+ : "i" (offsetof (struct pthread, header.gscope_flag)), \
+ "0" (THREAD_GSCOPE_FLAG_UNUSED)); \
+ if (__res == THREAD_GSCOPE_FLAG_WAIT) \
+ lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \
+ } \
+ while (0)
+#define THREAD_GSCOPE_SET_FLAG() \
+ THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
+#define THREAD_GSCOPE_WAIT() \
+ GL(dl_wait_lookup_done) ()
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* tls.h */
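
The THREAD_GETMEM/THREAD_SETMEM family above boils down to %gs-relative loads and stores. A stripped-down sketch of a 4-byte read (i386-only; gs_read32 is a hypothetical helper, and unlike the real macro it takes the offset as a runtime value instead of baking it in as an immediate via offsetof):

  #include <stddef.h>
  #include <stdint.h>

  /* Read a 32-bit field of the current thread descriptor through the
     %gs segment, the way THREAD_GETMEM does for 4-byte members.  */
  static inline uint32_t
  gs_read32 (size_t offset)
  {
    uint32_t value;
    __asm__ volatile ("movl %%gs:(%1), %0" : "=r" (value) : "r" (offset));
    return value;
  }
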
diff --git a/REORG.TODO/sysdeps/i386/preconfigure b/REORG.TODO/sysdeps/i386/preconfigure
new file mode 100644
index 0000000000..c8fefd1bff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/preconfigure
@@ -0,0 +1,5 @@
+# preconfigure fragment for i386.
+
+case "$machine" in
+i[4567]86) base_machine=i386 machine=i386/$machine ;;
+esac
diff --git a/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
new file mode 100644
index 0000000000..f71a9fcb2d
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S
@@ -0,0 +1,46 @@
+/* Copyright (C) 2002-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <pthread-errnos.h>
+
+
+#ifdef UP
+# define LOCK
+#else
+# define LOCK lock
+#endif
+
+ .globl pthread_spin_trylock
+ .type pthread_spin_trylock,@function
+ .align 16
+pthread_spin_trylock:
+ movl 4(%esp), %edx
+ movl $1, %eax
+ xorl %ecx, %ecx
+ LOCK
+ cmpxchgl %ecx, (%edx)
+ movl $EBUSY, %eax
+#ifdef HAVE_CMOV
+ cmovel %ecx, %eax
+#else
+ jne 0f
+ movl %ecx, %eax
+0:
+#endif
+ ret
+ .size pthread_spin_trylock,.-pthread_spin_trylock
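
The trylock above in C form: compare-and-swap the lock word from 1 (free) to 0 (held), returning 0 on success and EBUSY otherwise, which is what the cmpxchgl/cmov pair computes. A sketch using the GCC __atomic builtin:

  #include <errno.h>

  int
  spin_trylock_sketch (volatile int *lock)
  {
    int expected = 1;                 /* %eax before cmpxchgl */
    if (__atomic_compare_exchange_n (lock, &expected, 0, 0,
                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
      return 0;                       /* cmovel %ecx, %eax */
    return EBUSY;
  }
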
diff --git a/REORG.TODO/sysdeps/i386/rawmemchr.S b/REORG.TODO/sysdeps/i386/rawmemchr.S
new file mode 100644
index 0000000000..246ec3f18e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rawmemchr.S
@@ -0,0 +1,222 @@
+/* rawmemchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+ This version is developed using the same algorithm as the fast C
+ version which carries the following introduction:
+ Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+ with help from Dan Sahlin (dan@sics.se) and
+ commentary by Jim Blandy (jimb@ai.mit.edu);
+ adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+ and implemented by Roland McGrath (roland@ai.mit.edu).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (__rawmemchr)
+
+ /* Save callee-safe register used in this function. */
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+
+ /* Load parameters into registers. */
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ /* At the moment %edx contains C. What we need for the
+ algorithm is C in all bytes of the dword. Avoid
+ operations on 16 bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %dl, %dh /* Now it is 0|0|c|c */
+ movl %edx, %ecx
+ shll $16, %edx /* Now c|c|0|0 */
+ movw %cx, %dx /* And finally c|c|c|c */
+
+ /* Better performance can be achieved if the word (32
+ bit) memory access is aligned on a four-byte boundary.
+ So process the first bytes one by one until the boundary
+ is reached. A loop is avoided here for better performance. */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(1) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(1) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ je L(1) /* yes => begin loop */
+ cmpb %dl, (%eax) /* compare byte */
+ je L(9) /* target found => return */
+ incl %eax /* increment source pointer */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for C, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is C. This turns each byte that is C
+ into a zero. */
+
+
+ /* Each round the main loop processes 16 bytes. */
+ ALIGN (4)
+
+L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+
+ /* According to the algorithm we had to reverse the effect of the
+ XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag and it would (in a
+ representation with more than 32 bits) not alter the last
+ overflow, we can now test this condition. If no carry is signaled
+ no overflow must have occurred in the last byte => it was 0. */
+ jnc L(8)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is C we don't get 0 in %edi. */
+ jnz L(8) /* found it => return pointer */
+
+ /* This process is unrolled four times for better performance.
+ We don't increment the source pointer each time. Instead we
+ use offsets and increment by 16 in each run of the loop. But
+ before returning the address of a matching byte we need some
+ extra code to add the missing increments (labels L(5) to L(7)
+ below). */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(7) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(7) /* found it => return pointer */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(6) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(6) /* found it => return pointer */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(5) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(5) /* found it => return pointer */
+
+ /* Adjust both counters for a full round, i.e. 16 bytes. */
+ addl $16, %eax
+ jmp L(1)
+ /* add missing source pointer increments */
+L(5): addl $4, %eax
+L(6): addl $4, %eax
+L(7): addl $4, %eax
+
+ /* Test for the matching byte in the word. %ecx contains a NUL
+ char in the byte which originally was the byte we are looking
+ at. */
+L(8): testb %cl, %cl /* test first byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+ testb %ch, %ch /* test second byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+ testl $0xff0000, %ecx /* test third byte in dword */
+ jz L(9) /* if zero => return pointer */
+ incl %eax /* increment source pointer */
+
+ /* No further test is needed; we know it is one of the four bytes. */
+
+L(9):
+ popl %edi /* pop saved register */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__rawmemchr)
+
+libc_hidden_def (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
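
The long comments above describe the 0xfefefeff carry trick; here is an equivalent, better-known C spelling of the same zero-byte test, plus a word-at-a-time sketch of the scan (assumes an aligned pointer; the real code handles the head bytes first):

  #include <stdint.h>

  /* Non-zero iff some byte of WORD is zero -- the C formulation of
     the carry trick the assembly implements.  */
  static int
  has_zero_byte (uint32_t word)
  {
    return ((word - 0x01010101u) & ~word & 0x80808080u) != 0;
  }

  /* XOR with C replicated into every byte turns matching bytes into
     zero; the test above then locates them.  */
  static const char *
  rawmemchr_sketch (const char *s, unsigned char c)
  {
    uint32_t mask = 0x01010101u * c;
    const uint32_t *w = (const uint32_t *) s;   /* assume alignment */
    while (!has_zero_byte (*w ^ mask))
      w++;
    const char *p = (const char *) w;
    while (*(const unsigned char *) p != c)
      p++;
    return p;
  }
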
diff --git a/REORG.TODO/sysdeps/i386/rshift.S b/REORG.TODO/sysdeps/i386/rshift.S
new file mode 100644
index 0000000000..cf179052b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rshift.S
@@ -0,0 +1,105 @@
+/* i80386 __mpn_rshift -- Shift a limb vector right by a bit count.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+12 /* space for 3 saved regs */
+#define RES PARMS
+#define S RES+4
+#define SIZE S+4
+#define CNT SIZE+4
+
+ .text
+ENTRY (__mpn_rshift)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 8)
+ movl S(%esp),%esi
+ cfi_rel_offset (esi, 4)
+ movl SIZE(%esp),%edx
+ movl CNT(%esp),%ecx
+ leal -4(%edi,%edx,4),%edi
+ leal (%esi,%edx,4),%esi
+ negl %edx
+
+ movl (%esi,%edx,4),%ebx /* read least significant limb */
+ cfi_rel_offset (ebx, 0)
+ cfi_remember_state
+ xorl %eax,%eax
+ shrdl %cl,%ebx,%eax /* compute carry limb */
+ incl %edx
+ jz L(end)
+ pushl %eax /* push carry limb onto stack */
+ cfi_adjust_cfa_offset (4)
+ testb $1,%dl
+ jnz L(1) /* enter loop in the middle */
+ movl %ebx,%eax
+
+ ALIGN (3)
+L(oop): movl (%esi,%edx,4),%ebx /* load next higher limb */
+ shrdl %cl,%ebx,%eax /* compute result limb */
+ movl %eax,(%edi,%edx,4) /* store it */
+ incl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shrdl %cl,%eax,%ebx
+ movl %ebx,(%edi,%edx,4)
+ incl %edx
+ jnz L(oop)
+
+ shrl %cl,%eax /* compute most significant limb */
+ movl %eax,(%edi) /* store it */
+
+ popl %eax /* pop carry limb */
+ cfi_adjust_cfa_offset (-4)
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_restore_state
+L(end): shrl %cl,%ebx /* compute most significant limb */
+ movl %ebx,(%edi) /* store it */
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_rshift)
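
Reference semantics of the shift above, with 32-bit limbs: {res,size} = {s,size} >> cnt, returning the bits shifted out of the low end in the high bits of the return value. A sketch, not the GMP source; requires 0 < cnt < 32 and size >= 1.

  #include <stddef.h>
  #include <stdint.h>

  uint32_t
  mpn_rshift_ref (uint32_t *res, const uint32_t *s, size_t size,
                  unsigned int cnt)
  {
    uint32_t retval = s[0] << (32 - cnt);   /* carry limb (shrdl) */
    for (size_t i = 0; i + 1 < size; i++)
      res[i] = (s[i] >> cnt) | (s[i + 1] << (32 - cnt));
    res[size - 1] = s[size - 1] >> cnt;     /* most significant limb */
    return retval;
  }
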
diff --git a/REORG.TODO/sysdeps/i386/setfpucw.c b/REORG.TODO/sysdeps/i386/setfpucw.c
new file mode 100644
index 0000000000..40b995f18a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setfpucw.c
@@ -0,0 +1,54 @@
+/* Set the FPU control word for x86.
+ Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <math.h>
+#include <fpu_control.h>
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+void
+__setfpucw (fpu_control_t set)
+{
+ fpu_control_t cw;
+
+ /* Fetch the current control word. */
+ __asm__ ("fnstcw %0" : "=m" (*&cw));
+
+ /* Preserve the reserved bits, and set the rest as the user
+ specified (or the default, if the user gave zero). */
+ cw &= _FPU_RESERVED;
+ cw |= set & ~_FPU_RESERVED;
+
+ __asm__ ("fldcw %0" : : "m" (*&cw));
+
+ /* If the CPU supports SSE, we set the MXCSR as well. */
+ if (HAS_CPU_FEATURE (SSE))
+ {
+ unsigned int xnew_exc;
+
+ /* Get the current MXCSR. */
+ __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+ xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7));
+ xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7);
+
+ __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+ }
+}
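
The shifts in the MXCSR path, isolated: the x87 control word keeps rounding control in bits 10-11 and the exception masks in bits 0-5, while MXCSR keeps the same fields at bits 13-14 and 7-12, hence << 3 and << 7. A sketch, with 0x3f standing in for FE_ALL_EXCEPT on x86:

  #include <stdint.h>

  static uint32_t
  cw_fields_to_mxcsr (uint16_t cw, uint32_t mxcsr)
  {
    mxcsr &= ~((0xc00u << 3) | (0x3fu << 7));   /* clear RC and masks */
    mxcsr |= ((uint32_t) (cw & 0xc00) << 3)     /* rounding control */
             | ((uint32_t) (cw & 0x3f) << 7);   /* exception masks */
    return mxcsr;
  }
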
diff --git a/REORG.TODO/sysdeps/i386/setjmp.S b/REORG.TODO/sysdeps/i386/setjmp.S
new file mode 100644
index 0000000000..738a899e8b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/setjmp.S
@@ -0,0 +1,58 @@
+/* setjmp for i386.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <jmpbuf-offsets.h>
+#include <asm-syntax.h>
+#include <stap-probe.h>
+
+#define PARMS 4 /* no space for saved regs */
+#define JMPBUF PARMS
+#define SIGMSK JMPBUF+4
+
+ENTRY (__sigsetjmp)
+
+ movl JMPBUF(%esp), %eax
+
+ /* Save registers. */
+ movl %ebx, (JB_BX*4)(%eax)
+ movl %esi, (JB_SI*4)(%eax)
+ movl %edi, (JB_DI*4)(%eax)
+ leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_SP*4)(%eax)
+ movl 0(%esp), %ecx /* Save PC we are returning to now. */
+ LIBC_PROBE (setjmp, 3, 4@%eax, -4@SIGMSK(%esp), 4@%ecx)
+#ifdef PTR_MANGLE
+ PTR_MANGLE (%ecx)
+#endif
+ movl %ecx, (JB_PC*4)(%eax)
+ movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */
+
+#if IS_IN (rtld)
+ /* In ld.so we never save the signal mask. */
+ xorl %eax, %eax
+ ret
+#else
+ /* Make a tail call to __sigjmp_save; it takes the same args. */
+ jmp __sigjmp_save
+#endif
+END (__sigsetjmp)
+hidden_def (__sigsetjmp)
diff --git a/REORG.TODO/sysdeps/i386/stackguard-macros.h b/REORG.TODO/sysdeps/i386/stackguard-macros.h
new file mode 100644
index 0000000000..039762927c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackguard-macros.h
@@ -0,0 +1,12 @@
+#include <stdint.h>
+
+#define STACK_CHK_GUARD \
+ ({ uintptr_t x; asm ("movl %%gs:0x14, %0" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+ ({ \
+ uintptr_t x; \
+ asm ("movl %%gs:%c1, %0" : "=r" (x) \
+ : "i" (offsetof (tcbhead_t, pointer_guard))); \
+ x; \
+ })
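
Why 0x14: with the i386 tcbhead_t shown in tls.h above (tcb, dtv, self, multiple_threads, sysinfo, then stack_guard), stack_guard sits at byte offset 20 == 0x14 when pointers are 4 bytes. A mirror struct to check the arithmetic; illustration only, valid on 32-bit targets:

  #include <stddef.h>
  #include <stdint.h>

  struct tcbhead_sketch        /* mirror of the leading fields */
  {
    void *tcb;
    void *dtv;                 /* dtv_t * in the real header */
    void *self;
    int multiple_threads;
    uintptr_t sysinfo;
    uintptr_t stack_guard;     /* offset 20 == 0x14 on i386 */
    uintptr_t pointer_guard;   /* offset 24 == 0x18 */
  };

  _Static_assert (offsetof (struct tcbhead_sketch, stack_guard) == 0x14,
                  "matches the %gs:0x14 access in STACK_CHK_GUARD");
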
diff --git a/REORG.TODO/sysdeps/i386/stackinfo.h b/REORG.TODO/sysdeps/i386/stackinfo.h
new file mode 100644
index 0000000000..ba17867d3a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stackinfo.h
@@ -0,0 +1,43 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains a bit of information about the stack allocation
+ of the processor. */
+
+#ifndef _STACKINFO_H
+#define _STACKINFO_H 1
+
+#include <elf.h>
+
+/* On x86 the stack grows down. */
+#define _STACK_GROWS_DOWN 1
+
+/* Default to an executable stack. PF_X can be overridden if PT_GNU_STACK is
+ * present, but it is presumed absent. */
+#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X)
+
+/* Access to the stack pointer. The macros are used in alloca_account
+ for which they need to act as barriers as well, hence the additional
+ (unnecessary) parameters. */
+#define stackinfo_get_sp() \
+ ({ void *p__; asm volatile ("mov %%esp, %0" : "=r" (p__)); p__; })
+#define stackinfo_sub_sp(ptr) \
+ ({ ptrdiff_t d__; \
+ asm volatile ("sub %%esp, %0" : "=r" (d__) : "0" (ptr)); \
+ d__; })
+
+#endif /* stackinfo.h */
diff --git a/REORG.TODO/sysdeps/i386/start.S b/REORG.TODO/sysdeps/i386/start.S
new file mode 100644
index 0000000000..ccb1e2b38f
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/start.S
@@ -0,0 +1,139 @@
+/* Startup code compliant to the ELF i386 ABI.
+ Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file with other
+ programs, and to distribute those programs without any restriction
+ coming from the use of this file. (The GNU Lesser General Public
+ License restrictions do apply in other respects; for example, they
+ cover modification of the file, and distribution when not linked
+ into another program.)
+
+ Note that people who make modified versions of this file are not
+ obligated to grant this special exception for their modified
+ versions; it is their choice whether to do so. The GNU Lesser
+ General Public License gives permission to release a modified
+ version without this exception; this exception also makes it
+ possible to release a modified version which carries forward this
+ exception.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is the canonical entry point, usually the first thing in the text
+ segment. The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry
+ point runs, most registers' values are unspecified, except for:
+
+ %edx Contains a function pointer to be registered with `atexit'.
+ This is how the dynamic linker arranges to have DT_FINI
+ functions called for shared libraries that have been loaded
+ before this code runs.
+
+ %esp The stack contains the arguments and environment:
+ 0(%esp) argc
+ 4(%esp) argv[0]
+ ...
+ (4*argc)(%esp) NULL
+ (4*(argc+1))(%esp) envp[0]
+ ...
+ NULL
+*/
+
+ .text
+ .globl _start
+ .type _start,@function
+_start:
+ /* Clear the frame pointer. The ABI suggests this be done, to
+ clearly mark the outermost frame. */
+ xorl %ebp, %ebp
+
+ /* Extract the arguments as encoded on the stack and set up
+ the arguments for `main': argc, argv. envp will be determined
+ later in __libc_start_main. */
+ popl %esi /* Pop the argument count. */
+ movl %esp, %ecx /* argv starts just at the current stack top.*/
+
+ /* Before pushing the arguments align the stack to a 16-byte
+ (SSE needs 16-byte alignment) boundary to avoid penalties from
+ misaligned accesses. Thanks to Edward Seidl <seidl@janed.com>
+ for pointing this out. */
+ andl $0xfffffff0, %esp
+ pushl %eax /* Push garbage because we allocate
+ 28 more bytes. */
+
+ /* Provide the highest stack address to the user code (for stacks
+ which grow downwards). */
+ pushl %esp
+
+ pushl %edx /* Push address of the shared library
+ termination function. */
+
+#ifdef SHARED
+ /* Load PIC register. */
+ call 1f
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+
+ /* Push address of our own entry points to .fini and .init. */
+ leal __libc_csu_fini@GOTOFF(%ebx), %eax
+ pushl %eax
+ leal __libc_csu_init@GOTOFF(%ebx), %eax
+ pushl %eax
+
+ pushl %ecx /* Push second argument: argv. */
+ pushl %esi /* Push first argument: argc. */
+
+ pushl main@GOT(%ebx)
+
+ /* Call the user's main function, and exit with its value.
+ But let the libc call main. */
+ call __libc_start_main@PLT
+#else
+ /* Push address of our own entry points to .fini and .init. */
+ pushl $__libc_csu_fini
+ pushl $__libc_csu_init
+
+ pushl %ecx /* Push second argument: argv. */
+ pushl %esi /* Push first argument: argc. */
+
+ pushl $main
+
+ /* Call the user's main function, and exit with its value.
+ But let the libc call main. */
+ call __libc_start_main
+#endif
+
+ hlt /* Crash if somehow `exit' does return. */
+
+#ifdef SHARED
+1: movl (%esp), %ebx
+ ret
+#endif
+
+/* To fulfill the System V/i386 ABI we need this symbol. Yuck, it's so
+ meaningless since we don't support machines < 80386. */
+ .section .rodata
+ .globl _fp_hw
+_fp_hw: .long 3
+ .size _fp_hw, 4
+ .type _fp_hw,@object
+
+/* Define a symbol for the first piece of initialized data. */
+ .data
+ .globl __data_start
+__data_start:
+ .long 0
+ .weak data_start
+ data_start = __data_start
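
Read bottom-up, the pushes above build the argument list for this call. In C terms the interface looks roughly like the following; treat it as a sketch of glibc's csu prototype, whose exact parameter types are internal (in particular, init also receives the main-style argc/argv/envp):

  int __libc_start_main (int (*main) (int argc, char **argv, char **envp),
                         int argc, char **argv,
                         void (*init) (int, char **, char **),
                         void (*fini) (void),
                         void (*rtld_fini) (void),   /* the %edx atexit hook */
                         void *stack_end);
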
diff --git a/REORG.TODO/sysdeps/i386/stpcpy.S b/REORG.TODO/sysdeps/i386/stpcpy.S
new file mode 100644
index 0000000000..d9981b677b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpcpy.S
@@ -0,0 +1,88 @@
+/* Copy SRC to DEST returning the address of the terminating '\0' in DEST.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+ also not invented here. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+
+ .text
+ENTRY (__stpcpy)
+
+ movl DEST(%esp), %eax
+ movl SRC(%esp), %ecx
+ subl %eax, %ecx /* magic: reduce number of loop variants
+ to one using addressing mode */
+
+ /* Here we would like to write
+
+ subl $4, %eax
+ ALIGN (4)
+
+ but the assembler is too smart and optimizes for the shortest
+ form where the number only needs one byte. But if we could
+ have the long form we would not need the alignment. */
+
+ .byte 0x81, 0xe8 /* This is `subl $0x00000004, %eax' */
+ .long 0x00000004
+
+ /* Four times unrolled loop with only one loop counter. This
+ is achieved by the use of index+base addressing mode. As the
+ loop counter we use the destination address because this is
+ also the result. */
+L(1): addl $4, %eax /* increment loop counter */
+
+ movb (%eax,%ecx), %dl /* load current char */
+ movb %dl, (%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(2) /* yes, then exit */
+
+ movb 1(%eax,%ecx), %dl /* load current char */
+ movb %dl, 1(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(3) /* yes, then exit */
+
+ movb 2(%eax,%ecx), %dl /* load current char */
+ movb %dl, 2(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(4) /* yes, then exit */
+
+ movb 3(%eax,%ecx), %dl /* load current char */
+ movb %dl, 3(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jnz L(1) /* no, then continue loop */
+
+ incl %eax /* correct loop counter */
+L(4): incl %eax
+L(3): incl %eax
+L(2):
+
+ ret
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
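
The reference behaviour the unrolled loop implements: copy SRC to DEST and return the address of the terminating NUL written into DEST (strcpy, by contrast, returns DEST itself). A byte-wise C sketch:

  char *
  stpcpy_ref (char *dest, const char *src)
  {
    while ((*dest = *src++) != '\0')
      dest++;
    return dest;                /* points at the NUL just stored */
  }
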
diff --git a/REORG.TODO/sysdeps/i386/stpncpy.S b/REORG.TODO/sysdeps/i386/stpncpy.S
new file mode 100644
index 0000000000..46f2aba713
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/stpncpy.S
@@ -0,0 +1,147 @@
+/* Copy no more than N bytes from SRC to DEST, returning the address of
+ the terminating '\0' in DEST.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+ - original wrote n+1 chars in some cases.
+ - stpncpy() ought to behave like strncpy(), i.e. not null-terminate
+ if limited by n. glibc-1.09 stpncpy() does this.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+#define LEN SRC+4
+
+ .text
+ENTRY (__stpncpy)
+
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %eax
+ movl SRC(%esp), %esi
+ cfi_rel_offset (esi, 0)
+ movl LEN(%esp), %ecx
+
+ subl %eax, %esi /* magic: reduce number of loop variants
+ to one using addressing mode */
+ jmp L(1) /* jump to loop "head" */
+
+ ALIGN(4)
+
+ /* Four times unrolled loop with two loop counters. We get the
+ third value (the source address) by using the index+base
+ addressing mode. */
+L(2): movb (%eax,%esi), %dl /* load current char */
+ movb %dl, (%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(7) /* yes, then exit */
+
+ movb 1(%eax,%esi), %dl /* load current char */
+ movb %dl, 1(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(6) /* yes, then exit */
+
+ movb 2(%eax,%esi), %dl /* load current char */
+ movb %dl, 2(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(5) /* yes, then exit */
+
+ movb 3(%eax,%esi), %dl /* load current char */
+ movb %dl, 3(%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(4) /* yes, then exit */
+
+ addl $4, %eax /* increment loop counter for full round */
+
+L(1): subl $4, %ecx /* still more than 4 bytes allowed? */
+ jae L(2) /* yes, then go to start of loop */
+
+ /* The remaining bytes (at most 3) are not processed in a loop. */
+
+ addl $4, %ecx /* correct above subtraction */
+ jz L(9) /* maximal allowed char reached => go to end */
+
+ movb (%eax,%esi), %dl /* load current char */
+ movb %dl, (%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(3) /* yes, then exit */
+
+ incl %eax /* increment pointer */
+ decl %ecx /* decrement length counter */
+ jz L(9) /* no more allowed => exit */
+
+ movb (%eax,%esi), %dl /* load current char */
+ movb %dl, (%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(3) /* yes, then exit */
+
+ incl %eax /* increment pointer */
+ decl %ecx /* decrement length counter */
+ jz L(9) /* no more allowed => exit */
+
+ movb (%eax,%esi), %dl /* load current char */
+ movb %dl, (%eax) /* and store it */
+ testb %dl, %dl /* was it NUL? */
+ jz L(3) /* yes, then exit */
+
+ incl %eax /* increment pointer */
+ jmp L(9) /* we don't have to test for counter underflow
+ because we know we had at most 3 bytes
+ remaining => exit */
+
+ /* When coming from the main loop we have to adjust the pointer. */
+L(4): decl %ecx /* decrement counter */
+ incl %eax /* increment pointer */
+
+L(5): decl %ecx /* decrement counter */
+ incl %eax /* increment pointer */
+
+L(6): decl %ecx /* decrement counter */
+ incl %eax /* increment pointer */
+L(7):
+
+ addl $3, %ecx /* correct pre-decrementation of counter
+ at the beginning of the loop; but why 3
+ and not 4? Very simple: we have to count
+ the NUL char we already wrote. */
+ jz L(9) /* counter is also 0 => exit */
+
+ /* We now have to fill the rest of the buffer with NUL. This
+ is done in a tricky way. Please note that the addressing mode
+ used below is not the same as the one used above. Here we use the
+ %ecx register. */
+L(8):
+ movb $0, (%ecx,%eax) /* store NUL char */
+L(3): decl %ecx /* all bytes written? */
+ jnz L(8) /* no, then again */
+
+L(9): popl %esi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+
+ ret
+END (__stpncpy)
+
+libc_hidden_def (__stpncpy)
+weak_alias (__stpncpy, stpncpy)
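
Semantics of the routine above, in C: copy at most N bytes, pad the rest of the buffer with NULs, and return a pointer just past the last non-NUL byte copied (or dest + n if the source did not fit). A reference sketch:

  #include <stddef.h>

  char *
  stpncpy_ref (char *dest, const char *src, size_t n)
  {
    size_t i = 0;
    for (; i < n && src[i] != '\0'; i++)
      dest[i] = src[i];
    char *end = dest + i;       /* where the NUL goes, or dest + n */
    for (; i < n; i++)
      dest[i] = '\0';           /* the NUL-fill loop, L(8) above */
    return end;
  }
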
diff --git a/REORG.TODO/sysdeps/i386/strcat.S b/REORG.TODO/sysdeps/i386/strcat.S
new file mode 100644
index 0000000000..4a26b3c528
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcat.S
@@ -0,0 +1,265 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+ For Intel 80x86, x>=4.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+ Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define DEST RTN
+#define SRC DEST+4
+
+ .text
+ENTRY (strcat)
+
+ pushl %edi /* Save callee-safe register. */
+ cfi_adjust_cfa_offset (4)
+
+ movl DEST(%esp), %edx
+ movl SRC(%esp), %ecx
+
+ testb $0xff, (%ecx) /* Is source string empty? */
+ jz L(8) /* yes => return */
+
+ /* Test the first bytes separately until destination is aligned. */
+ testl $3, %edx /* destination pointer aligned? */
+ jz L(1) /* yes => begin scan loop */
+ testb $0xff, (%edx) /* is end of string? */
+ jz L(2) /* yes => start appending */
+ incl %edx /* increment destination pointer */
+
+ testl $3, %edx /* destination pointer aligned? */
+ jz L(1) /* yes => begin scan loop */
+ testb $0xff, (%edx) /* is end of string? */
+ jz L(2) /* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+ testl $3, %edx /* destination pointer aligned? */
+ jz L(1) /* yes => begin scan loop */
+ testb $0xff, (%edx) /* is end of string? */
+ jz L(2) /* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+ /* Now we are aligned. Begin scan loop. */
+ jmp L(1)
+
+ cfi_rel_offset (edi, 0)
+ ALIGN(4)
+
+L(4): addl $16,%edx /* increment destination pointer for round */
+
+L(1): movl (%edx), %eax /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that an `xorl' statement is missing here.  But you must
+	   not forget that we are looking for C == 0 and `xorl $0, %eax'
+	   is a no-op. */
+
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+
+ /* According to the algorithm we had to reverse the effect of the
+ XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+ overflow, we can now test this condition. If no carry is signaled
+ no overflow must have occurred in the last byte => it was 0. */
+ jnc L(3)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi. */
+ jnz L(3)
+
+	movl 4(%edx), %eax	/* get word from destination */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(5) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(5) /* one byte is NUL => stop copying */
+
+	movl 8(%edx), %eax	/* get word from destination */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(6) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(6) /* one byte is NUL => stop copying */
+
+	movl 12(%edx), %eax	/* get word from destination */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(7) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(4) /* no byte is NUL => carry on copying */
+
+L(7):	addl $4, %edx		/* adjust destination pointer */
+L(6): addl $4, %edx
+L(5): addl $4, %edx
+
+L(3): testb %al, %al /* is first byte NUL? */
+ jz L(2) /* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+ testb %ah, %ah /* is second byte NUL? */
+ jz L(2) /* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+ testl $0xff0000, %eax /* is third byte NUL? */
+ jz L(2) /* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+L(2): subl %ecx, %edx /* reduce number of loop variants */
+
+ /* Now we have to align the source pointer. */
+ testl $3, %ecx /* pointer correctly aligned? */
+ jz L(29) /* yes => start copy loop */
+ movb (%ecx), %al /* get first byte */
+ movb %al, (%ecx,%edx) /* and store it */
+ andb %al, %al /* is byte NUL? */
+ jz L(8) /* yes => return */
+ incl %ecx /* increment pointer */
+
+ testl $3, %ecx /* pointer correctly aligned? */
+ jz L(29) /* yes => start copy loop */
+ movb (%ecx), %al /* get first byte */
+ movb %al, (%ecx,%edx) /* and store it */
+ andb %al, %al /* is byte NUL? */
+ jz L(8) /* yes => return */
+ incl %ecx /* increment pointer */
+
+ testl $3, %ecx /* pointer correctly aligned? */
+ jz L(29) /* yes => start copy loop */
+ movb (%ecx), %al /* get first byte */
+ movb %al, (%ecx,%edx) /* and store it */
+ andb %al, %al /* is byte NUL? */
+ jz L(8) /* yes => return */
+ incl %ecx /* increment pointer */
+
+ /* Now we are aligned. */
+ jmp L(29) /* start copy loop */
+
+ ALIGN(4)
+
+L(28): movl %eax, 12(%ecx,%edx)/* store word at destination */
+ addl $16, %ecx /* adjust pointer for full round */
+
+L(29): movl (%ecx), %eax /* get word from source */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(9) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(9) /* one byte is NUL => stop copying */
+ movl %eax, (%ecx,%edx) /* store word to destination */
+
+ movl 4(%ecx), %eax /* get word from source */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(91) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(91) /* one byte is NUL => stop copying */
+ movl %eax, 4(%ecx,%edx) /* store word to destination */
+
+ movl 8(%ecx), %eax /* get word from source */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(92) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(92) /* one byte is NUL => stop copying */
+ movl %eax, 8(%ecx,%edx) /* store word to destination */
+
+ movl 12(%ecx), %eax /* get word from source */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %eax, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(93) /* highest byte is C => stop copying */
+ xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+	jz L(28)		/* no byte is NUL => carry on copying */
+
+L(93): addl $4, %ecx /* adjust pointer */
+L(92): addl $4, %ecx
+L(91): addl $4, %ecx
+
+L(9): movb %al, (%ecx,%edx) /* store first byte of last word */
+ orb %al, %al /* is it NUL? */
+ jz L(8) /* yes => return */
+
+ movb %ah, 1(%ecx,%edx) /* store second byte of last word */
+ orb %ah, %ah /* is it NUL? */
+ jz L(8) /* yes => return */
+
+ shrl $16, %eax /* make upper bytes accessible */
+ movb %al, 2(%ecx,%edx) /* store third byte of last word */
+ orb %al, %al /* is it NUL? */
+ jz L(8) /* yes => return */
+
+ movb %ah, 3(%ecx,%edx) /* store fourth byte of last word */
+
+L(8): movl DEST(%esp), %eax /* start address of destination is result */
+ popl %edi /* restore saved register */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (strcat)
+libc_hidden_builtin_def (strcat)
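The word test that this file (and the files below) repeats for every
dword translates into C roughly as follows.  This is an illustration
only (has_zero_byte is a made-up name), not library code:

  #include <stdint.h>

  /* Nonzero iff WORD contains a zero byte.  Bits 8, 16 and 24 of the
     magic value are the "holes"; a carry propagates through a hole iff
     the byte below it is nonzero, and the processor carry flag acts as
     the hole above the topmost byte.  */
  static int
  has_zero_byte (uint32_t word)
  {
    uint32_t magic = 0xfefefeffU;
    uint32_t sum = word + magic;
    if (sum >= word)			/* no carry out: the jnc case */
      return 1;				/* => some byte was 0 */
    uint32_t changed = sum ^ word;	/* carry bits toggled by the add */
    return ((changed | magic) + 1) != 0; /* 0 iff every hole carried */
  }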
diff --git a/REORG.TODO/sysdeps/i386/strchr.S b/REORG.TODO/sysdeps/i386/strchr.S
new file mode 100644
index 0000000000..6075e77882
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchr.S
@@ -0,0 +1,290 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (strchr)
+
+ pushl %edi /* Save callee-safe registers used here. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ /* At the moment %edx contains C. What we need for the
+ algorithm is C in all bytes of the dword. Avoid
+	   operations on 16 bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %dl, %dh /* now it is 0|0|c|c */
+ movl %edx, %ecx
+ shll $16, %edx /* now it is c|c|0|0 */
+ movw %cx, %dx /* and finally c|c|c|c */
+
+ /* Before we start with the main loop we process single bytes
+ until the source pointer is aligned. This has two reasons:
+ 1. aligned 32-bit memory access is faster
+ and (more important)
+	   2. we process in the main loop 32 bits in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4). */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+ jz L(2) /* yes => return NULL */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+ jz L(2) /* yes => return NULL */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+ jz L(2) /* yes => return NULL */
+ incl %eax /* increment pointer */
+
+	/* Now we have reached alignment. */
+ jmp L(11) /* begin loop */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for C, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is C. This turns each byte that is C
+ into a zero. */
+
+ /* Each round the main loop processes 16 bytes. */
+
+ ALIGN(4)
+
+L(1): addl $16, %eax /* adjust pointer for whole round */
+
+L(11): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* C */
+
+ /* According to the algorithm we had to reverse the effect of the
+ XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+ overflow, we can now test this condition. If no carry is signaled
+ no overflow must have occurred in the last byte => it was 0. */
+ jnc L(7)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is C we don't get 0 in %edi. */
+ jnz L(7) /* found it => return pointer */
+
+ /* Now we made sure the dword does not contain the character we are
+ looking for. But because we deal with strings we have to check
+ for the end of string before testing the next dword. */
+
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(2) /* highest byte is NUL => return NULL */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(2) /* found NUL => return NULL */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* C */
+ jnc L(71) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(71) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(2) /* highest byte is NUL => return NULL */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(2) /* found NUL => return NULL */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* C */
+ jnc L(72) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(72) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(2) /* highest byte is NUL => return NULL */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(2) /* found NUL => return NULL */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* C */
+ jnc L(73) /* highest byte is C => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(73) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(2) /* highest byte is NUL => return NULL */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(1) /* no NUL found => restart loop */
+
+L(2): /* Return NULL. */
+ xorl %eax, %eax
+ popl %edi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+L(73): addl $4, %eax /* adjust pointer */
+L(72): addl $4, %eax
+L(71): addl $4, %eax
+
+ /* We now scan for the byte in which the character was matched.
+ But we have to take care of the case that a NUL char is
+ found before this in the dword. Note that we XORed %ecx
+ with the byte we're looking for, therefore the tests below look
+ reversed. */
+
+L(7): testb %cl, %cl /* is first byte C? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is first byte NUL? */
+ je L(2) /* yes => return NULL */
+ incl %eax /* it's not in the first byte */
+
+ testb %ch, %ch /* is second byte C? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %ch /* is second byte NUL? */
+	je L(2)			/* yes => return NULL */
+ incl %eax /* it's not in the second byte */
+
+ shrl $16, %ecx /* make upper byte accessible */
+ testb %cl, %cl /* is third byte C? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is third byte NUL? */
+ je L(2) /* yes => return NULL */
+
+ /* It must be in the fourth byte and it cannot be NUL. */
+ incl %eax
+
+L(6):
+ popl %edi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
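The reduction of the character search to a zero-byte search can be
sketched in C as follows; this illustration uses the common subtraction
form of the zero-byte test instead of the carry form used by the
assembly, and word_has_char is a made-up name:

  #include <stdint.h>

  /* Nonzero iff some byte of WORD equals C.  */
  static int
  word_has_char (uint32_t word, unsigned char c)
  {
    uint32_t mask = c * 0x01010101U;	/* c|c|c|c, like the movb/shll/movw */
    uint32_t x = word ^ mask;		/* bytes equal to C become 0 */
    return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
  }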
diff --git a/REORG.TODO/sysdeps/i386/strchrnul.S b/REORG.TODO/sysdeps/i386/strchrnul.S
new file mode 100644
index 0000000000..800b872c74
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strchrnul.S
@@ -0,0 +1,278 @@
+/* strchrnul (str, chr) -- Return pointer to first occurrence of CHR in STR
+ or the final NUL byte.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.org>
+ Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (__strchrnul)
+
+ pushl %edi /* Save callee-safe registers used here. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ /* At the moment %edx contains CHR. What we need for the
+ algorithm is CHR in all bytes of the dword. Avoid
+	   operations on 16 bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %dl, %dh /* now it is 0|0|c|c */
+ movl %edx, %ecx
+ shll $16, %edx /* now it is c|c|0|0 */
+ movw %cx, %dx /* and finally c|c|c|c */
+
+ /* Before we start with the main loop we process single bytes
+ until the source pointer is aligned. This has two reasons:
+ 1. aligned 32-bit memory access is faster
+ and (more important)
+	   2. we process in the main loop 32 bits in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4). */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+	jz L(6)			/* yes => return pointer to NUL */
+ incl %eax /* increment pointer */
+
+	/* Now we have reached alignment. */
+ jmp L(11) /* begin loop */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for CHR, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is CHR. This turns each byte that is CHR
+ into a zero. */
+
+ /* Each round the main loop processes 16 bytes. */
+
+ ALIGN(4)
+
+L(1): addl $16, %eax /* adjust pointer for whole round */
+
+L(11): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+
+ /* According to the algorithm we had to reverse the effect of the
+ XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+ overflow, we can now test this condition. If no carry is signaled
+ no overflow must have occurred in the last byte => it was 0. */
+ jnc L(7)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is CHR we don't get 0 in %edi. */
+ jnz L(7) /* found it => return pointer */
+
+ /* Now we made sure the dword does not contain the character we are
+ looking for. But because we deal with strings we have to check
+ for the end of string before testing the next dword. */
+
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+	jnc L(7)		/* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+	jnz L(7)		/* found NUL => return pointer */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(71) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(71) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+	jnc L(71)		/* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+	jnz L(71)		/* found NUL => return pointer */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(72) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(72) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+	jnc L(72)		/* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+	jnz L(72)		/* found NUL => return pointer */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(73) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(73) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+	jnc L(73)		/* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(1) /* no NUL found => restart loop */
+
+L(73): addl $4, %eax /* adjust pointer */
+L(72): addl $4, %eax
+L(71): addl $4, %eax
+
+ /* We now scan for the byte in which the character was matched.
+ But we have to take care of the case that a NUL char is
+ found before this in the dword. */
+
+L(7): testb %cl, %cl /* is first byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is first byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+ incl %eax /* it's not in the first byte */
+
+ testb %ch, %ch /* is second byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %ch /* is second byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+ incl %eax /* it's not in the second byte */
+
+ shrl $16, %ecx /* make upper byte accessible */
+ testb %cl, %cl /* is third byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is third byte NUL? */
+	je L(6)			/* yes => return pointer to NUL */
+
+ /* It must be in the fourth byte and it cannot be NUL. */
+ incl %eax
+
+L(6): popl %edi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__strchrnul)
+
+weak_alias (__strchrnul, strchrnul)
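Because the NUL branches above share the exit with the CHR branches,
the observable behaviour reduces to this short C sketch (reference
semantics only, made-up name):

  /* Like strchr, but a miss yields a pointer to the terminating NUL
     instead of NULL.  */
  char *
  strchrnul_ref (const char *s, int c)
  {
    while (*s != '\0' && *s != (char) c)
      s++;
    return (char *) s;
  }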
diff --git a/REORG.TODO/sysdeps/i386/strcspn.S b/REORG.TODO/sysdeps/i386/strcspn.S
new file mode 100644
index 0000000000..c852a3b1e5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcspn.S
@@ -0,0 +1,240 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+ which contains no characters from SS.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+#define STOP STR+4
+
+ .text
+ENTRY (strcspn)
+
+ movl STR(%esp), %edx
+ movl STOP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!! */
+
+ pushl %ecx /* make a 256 bytes long block filled with 0 */
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* align on a 16 byte boundary, which */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* improves the performance of the loop. */
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx. */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want
+ longer instructions so that the next loop aligns without adding nops. */
+
+L(2): movb (%eax), %cl /* get byte from stopset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 1(%eax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 2(%eax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 3(%eax), %cl /* get byte from stopset */
+ addl $4, %eax /* increment stopset pointer */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from stopset */
+
+L(1): leal -4(%edx), %eax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the stopset was found
+ and
+ 2. the end of the string was found
+	   As a sign that a character is in the stopset we store its
+	   value in the table.  Since the value stored for NUL is itself
+	   NUL, a single comparison also terminates the loop at the end
+	   of the string. */
+
+L(3): addl $4, %eax /* adjust pointer for full loop round */
+
+ movb (%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(4) /* yes => return */
+
+ movb 1(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(5) /* yes => return */
+
+ movb 2(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(6) /* yes => return */
+
+ movb 3(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+	jne L(3)		/* no => process next loop round */
+
+ incl %eax /* adjust pointer */
+L(6): incl %eax
+L(5): incl %eax
+
+L(4): addl $256, %esp /* remove stopset */
+ cfi_adjust_cfa_offset (-256)
+ subl %edx, %eax /* we have to return the number of valid
+ characters, so compute distance to first
+ non-valid character */
+ ret
+END (strcspn)
+libc_hidden_builtin_def (strcspn)
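The table trick can be restated in C; a sketch for illustration
(strcspn_table is a made-up name), not the glibc code:

  #include <stddef.h>

  /* Each stop character is stored at its own index; all other slots
     stay 0.  Because table['\0'] == 0, the single comparison
     table[c] == c stops both on a stop character and at the end of
     the string.  */
  size_t
  strcspn_table (const char *str, const char *stopset)
  {
    unsigned char table[256] = { 0 };
    const unsigned char *p;

    for (p = (const unsigned char *) stopset; *p != '\0'; p++)
      table[*p] = *p;

    p = (const unsigned char *) str;
    while (table[*p] != *p)
      p++;
    return (size_t) (p - (const unsigned char *) str);
  }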
diff --git a/REORG.TODO/sysdeps/i386/string-inlines.c b/REORG.TODO/sysdeps/i386/string-inlines.c
new file mode 100644
index 0000000000..d023bc3aa3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/string-inlines.c
@@ -0,0 +1,47 @@
+/* Copyright (C) 1999-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is to avoid PLT entries for the x86 version. */
+#define __memcpy_g __memcpy_g_internal
+#define __strchr_g __strchr_g_internal
+#include <string/string-inlines.c>
+
+void *
+(__memcpy_c) (void *d, const void *s, size_t n)
+{
+ return memcpy (d, s, n);
+}
+
+void *
+__memset_cc (void *s, unsigned long int pattern, size_t n)
+{
+ return memset (s, pattern & 0xff, n);
+}
+strong_alias (__memset_cc, __memset_cg)
+
+void *
+__memset_gg (void *s, char c, size_t n)
+{
+ return memset (s, c, n);
+}
+
+#ifdef __memcpy_c
+# undef __memcpy_g
+strong_alias (__memcpy_g_internal, __memcpy_g)
+# undef __strchr_g
+strong_alias (__strchr_g_internal, __strchr_g)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/strlen.S b/REORG.TODO/sysdeps/i386/strlen.S
new file mode 100644
index 0000000000..192fadf20a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.S
@@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+ Optimized for Intel 80x86, x>=4.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+
+ .text
+ENTRY (strlen)
+
+ movl STR(%esp), %ecx
+ movl %ecx, %eax /* duplicate it */
+
+ andl $3, %ecx /* mask alignment bits */
+ jz L(1) /* aligned => start loop */
+ cmpb %ch, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+ incl %eax /* increment pointer */
+
+ xorl $3, %ecx /* was alignment = 3? */
+ jz L(1) /* yes => now it is aligned and start loop */
+ cmpb %ch, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+ addl $1, %eax /* increment pointer */
+
+ subl $1, %ecx /* was alignment = 2? */
+ jz L(1) /* yes => now it is aligned and start loop */
+ cmpb %ch, (%eax) /* is byte NUL? */
+ je L(2) /* yes => return */
+
+/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The two additional bytes per instruction make
+   label 4 fall on a 16 byte boundary without padding nops.
+
+   The following `subl $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label 1 is necessary and so the pipeline is not flushed. */
+
+ subl $15, %eax /* effectively +1 */
+
+
+L(4): addl $16, %eax /* adjust pointer for full loop */
+
+L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edx /* magic value */
+ addl %ecx, %edx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(3) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edx /* (word+magic)^word */
+ orl $0xfefefeff, %edx /* set all non-carry bits */
+ incl %edx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(3) /* found NUL => return pointer */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edx /* magic value */
+ addl %ecx, %edx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(5) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edx /* (word+magic)^word */
+ orl $0xfefefeff, %edx /* set all non-carry bits */
+ incl %edx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(5) /* found NUL => return pointer */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edx /* magic value */
+ addl %ecx, %edx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(6) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edx /* (word+magic)^word */
+ orl $0xfefefeff, %edx /* set all non-carry bits */
+ incl %edx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(6) /* found NUL => return pointer */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edx /* magic value */
+ addl %ecx, %edx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(7) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edx /* (word+magic)^word */
+ orl $0xfefefeff, %edx /* set all non-carry bits */
+ incl %edx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(4) /* no NUL found => continue loop */
+
+L(7): addl $4, %eax /* adjust pointer */
+L(6): addl $4, %eax
+L(5): addl $4, %eax
+
+L(3): testb %cl, %cl /* is first byte NUL? */
+ jz L(2) /* yes => return */
+ incl %eax /* increment pointer */
+
+ testb %ch, %ch /* is second byte NUL? */
+ jz L(2) /* yes => return */
+ incl %eax /* increment pointer */
+
+ testl $0xff0000, %ecx /* is third byte NUL? */
+	jz L(2)			/* yes => return */
+ incl %eax /* increment pointer */
+
+L(2): subl STR(%esp), %eax /* compute difference to string start */
+
+ ret
+END (strlen)
+libc_hidden_builtin_def (strlen)
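The structure of the routine (byte-wise alignment prologue, unrolled
word loop, byte-wise epilogue) corresponds to the C sketch below.  It
uses the subtraction form of the zero-byte test; note that, like the
assembly, the aligned word load may read bytes past the terminator
within the same word, which is exactly the trick the alignment
prologue makes safe (it is not strictly portable C):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  size_t
  strlen_wordwise (const char *str)
  {
    const char *p = str;

    /* Process single bytes until the pointer is 4-byte aligned.  */
    while (((uintptr_t) p & 3) != 0)
      {
        if (*p == '\0')
          return p - str;
        p++;
      }

    /* Scan a word at a time until a word contains a zero byte.  */
    for (;;)
      {
        uint32_t word;
        memcpy (&word, p, 4);		/* aligned 4-byte load */
        if ((word - 0x01010101U) & ~word & 0x80808080U)
          break;			/* a zero byte is in WORD */
        p += 4;
      }

    /* Locate the NUL within the final word.  */
    while (*p != '\0')
      p++;
    return p - str;
  }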
diff --git a/REORG.TODO/sysdeps/i386/strlen.c b/REORG.TODO/sysdeps/i386/strlen.c
new file mode 100644
index 0000000000..0b69957392
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strlen.c
@@ -0,0 +1,35 @@
+/* Determine the length of a string. For Intel 80x86, x>=3.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Torbjorn Granlund (tege@sics.se).
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+size_t
+strlen (const char *str)
+{
+ int cnt;
+
+ asm("cld\n" /* Search forward. */
+ /* Some old versions of gas need `repne' instead of `repnz'. */
+ "repnz\n" /* Look for a zero byte. */
+ "scasb" /* %0, %1, %3 */ :
+ "=c" (cnt) : "D" (str), "0" (-1), "a" (0));
+
+ return -2 - cnt;
+}
+libc_hidden_builtin_def (strlen)
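To see where the `-2 - cnt' comes from: %ecx starts at -1 and `repnz
scasb' decrements it once for every byte examined, including the
terminating NUL.  For str = "ab" the instruction examines 'a', 'b' and
'\0', leaving cnt = -1 - 3 = -4, and -2 - (-4) = 2, the length of the
string.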
diff --git a/REORG.TODO/sysdeps/i386/strpbrk.S b/REORG.TODO/sysdeps/i386/strpbrk.S
new file mode 100644
index 0000000000..1109b233da
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strpbrk.S
@@ -0,0 +1,243 @@
+/* strpbrk (str, ss) -- Return pointer to the first occurrence in STR
+   of any character from SS.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define STR RTN
+#define STOP STR+4
+
+ .text
+ENTRY (strpbrk)
+
+ movl STR(%esp), %edx
+ movl STOP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!! */
+
+ pushl %ecx /* make a 256 bytes long block filled with 0 */
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+	pushl $0		/* These immediate values make label 2 */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* align on a 16 byte boundary, which */
+	cfi_adjust_cfa_offset (4)
+	pushl $0		/* improves the performance of the loop. */
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instructions only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx. */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want
+ longer instructions so that the next loop aligns without adding nops. */
+
+L(2): movb (%eax), %cl /* get byte from stopset */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 1(%eax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 2(%eax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 3(%eax), %cl /* get byte from stopset */
+ addl $4, %eax /* increment stopset pointer */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from stopset */
+
+L(1): leal -4(%edx), %eax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the stopset was found
+ and
+ 2. the end of the string was found
+	   As a sign that a character is in the stopset we store its
+	   value in the table.  Since the value stored for NUL is itself
+	   NUL, a single comparison also terminates the loop at the end
+	   of the string. */
+
+L(3): addl $4, %eax /* adjust pointer for full loop round */
+
+ movb (%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(4) /* yes => return */
+
+ movb 1(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(5) /* yes => return */
+
+ movb 2(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+ je L(6) /* yes => return */
+
+ movb 3(%eax), %cl /* get byte from string */
+ cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */
+	jne L(3)		/* no => process next loop round */
+
+ incl %eax /* adjust pointer */
+L(6): incl %eax
+L(5): incl %eax
+
+L(4): addl $256, %esp /* remove stopset */
+ cfi_adjust_cfa_offset (-256)
+
+ orb %cl, %cl /* was last character NUL? */
+ jnz L(7) /* no => return pointer */
+ xorl %eax, %eax
+
+L(7): ret
+END (strpbrk)
+libc_hidden_builtin_def (strpbrk)
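The only difference from strcspn is the epilogue, which turns the count
into a pointer; in C the relationship is simply this (reference
semantics, made-up name):

  #include <stddef.h>
  #include <string.h>

  char *
  strpbrk_ref (const char *s, const char *stopset)
  {
    s += strcspn (s, stopset);	/* skip the initial clean segment */
    return *s != '\0' ? (char *) s : NULL;
  }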
diff --git a/REORG.TODO/sysdeps/i386/strrchr.S b/REORG.TODO/sysdeps/i386/strrchr.S
new file mode 100644
index 0000000000..95b304dc0b
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strrchr.S
@@ -0,0 +1,334 @@
+/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (strrchr)
+
+ pushl %edi /* Save callee-safe registers used here. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ xorl %eax, %eax
+ movl STR(%esp), %esi
+ cfi_rel_offset (esi, 0)
+ movl CHR(%esp), %ecx
+
+ /* At the moment %ecx contains C. What we need for the
+ algorithm is C in all bytes of the dword. Avoid
+	   operations on 16 bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %cl, %ch /* now it is 0|0|c|c */
+ movl %ecx, %edx
+ shll $16, %ecx /* now it is c|c|0|0 */
+ movw %dx, %cx /* and finally c|c|c|c */
+
+ /* Before we start with the main loop we process single bytes
+ until the source pointer is aligned. This has two reasons:
+ 1. aligned 32-bit memory access is faster
+ and (more important)
+	   2. we process in the main loop 32 bits in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4). */
+
+ testl $3, %esi /* correctly aligned ? */
+ jz L(19) /* yes => begin loop */
+ movb (%esi), %dl /* load byte in question (we need it twice) */
+ cmpb %dl, %cl /* compare byte */
+	jne L(11)		/* no match => check for NUL */
+	movl %esi, %eax		/* remember pointer as possible result */
+L(11):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return current result */
+ incl %esi /* increment pointer */
+
+ testl $3, %esi /* correctly aligned ? */
+ jz L(19) /* yes => begin loop */
+ movb (%esi), %dl /* load byte in question (we need it twice) */
+ cmpb %dl, %cl /* compare byte */
+	jne L(12)		/* no match => check for NUL */
+	movl %esi, %eax		/* remember pointer as result */
+L(12):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return current result */
+ incl %esi /* increment pointer */
+
+ testl $3, %esi /* correctly aligned ? */
+ jz L(19) /* yes => begin loop */
+ movb (%esi), %dl /* load byte in question (we need it twice) */
+ cmpb %dl, %cl /* compare byte */
+	jne L(13)		/* no match => check for NUL */
+	movl %esi, %eax		/* remember pointer as result */
+L(13):	orb %dl, %dl		/* is NUL? */
+	jz L(2)			/* yes => return current result */
+ incl %esi /* increment pointer */
+
+	/* Now we have reached alignment. */
+ jmp L(19) /* begin loop */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for C, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is C. This turns each byte that is C
+ into a zero. */
+
+ /* Each round the main loop processes 16 bytes. */
+
+ /* Jump to here when the character is detected. We chose this
+ layout because the character we are looking for is less
+ frequent than the rest, and taking a conditional jump is more
+ expensive than ignoring it.
+
+ A few more words about the code below: it might not be obvious why
+ we decrement the source pointer here. In the loop the pointer
+ is not pre-incremented and so it still points before the word
+ we are looking at. But take a look at the instruction
+ which gets executed before we get into the loop: `addl $16, %esi'.
+ This is what turns the following subs into adds. */
+
+ /* These fill bytes make sure the main loop is correctly aligned.
+ We cannot use .align here because it is not the immediately
+ following instruction which should be aligned. */
+ .byte 0, 0
+#ifndef PROF
+ /* Profiling adds some code and so changes the alignment. */
+ .byte 0
+#endif
+
+L(4): subl $4, %esi /* adjust pointer */
+L(41): subl $4, %esi
+L(42): subl $4, %esi
+L(43): testl $0xff000000, %edx /* is highest byte == C? */
+ jnz L(33) /* no => try other bytes */
+ leal 15(%esi), %eax /* store address as result */
+ jmp L(1) /* and start loop again */
+
+L(3): subl $4, %esi /* adjust pointer */
+L(31): subl $4, %esi
+L(32): subl $4, %esi
+L(33): testl $0xff0000, %edx /* is C in third byte? */
+ jnz L(51) /* no => try other bytes */
+ leal 14(%esi), %eax /* store address as result */
+ jmp L(1) /* and start loop again */
+
+L(51):
+ /* At this point we know that the character is in one of the two
+ lower bytes. We make a guess and correct it if necessary. This
+ reduces the number of necessary jumps. */
+ leal 12(%esi), %eax /* guess address of lowest byte as result */
+ testb %dh, %dh /* is guess correct? */
+ jnz L(1) /* yes => start loop */
+ leal 13(%esi), %eax /* correct guess to second byte */
+
+L(1): addl $16, %esi /* increment pointer for full round */
+
+L(19): movl (%esi), %edx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+
+ /* According to the algorithm we would have to reverse the effect of
+ the XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag, and because it would (in
+ a representation with more than 32 bits) not alter the last
+ overflow, we can test this condition right now. If no carry is
+ signaled, no overflow can have occurred in the last byte => it was 0. */
+
+ jnc L(20) /* found NUL => check last word */
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %edx, %edi /* (word+magic)^word */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is NUL we don't get 0 in %edi. */
+ jnz L(20) /* found NUL => check last word */
+
+ /* Now we are sure the dword does not contain a NUL byte. But
+ because we deal with strings we still have to check whether it
+ contains the character we are looking for before loading the
+ next dword. */
+
+ xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(4) /* highest byte is C => examine dword */
+ xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(3) /* C is detected in the word => examine it */
+
+ movl 4(%esi), %edx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(21) /* found NUL => check last word */
+ xorl %edx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(21) /* found NUL => check last word */
+ xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(41) /* highest byte is C => examine dword */
+ xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(31) /* C is detected in the word => examine it */
+
+ movl 8(%esi), %edx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(22) /* found NUL => check last word */
+ xorl %edx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(22) /* found NUL => check last word */
+ xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(42) /* highest byte is C => examine dword */
+ xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(32) /* C is detected in the word => examine it */
+
+ movl 12(%esi), %edx /* get word (= 4 bytes) in question */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(23) /* found NUL => check last word */
+ xorl %edx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(23) /* found NUL => check last word */
+ xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %edx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(43) /* highest byte is C => examine dword */
+ xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(1) /* C is not detected => restart loop */
+ jmp L(33) /* examine word */
+
+L(23): addl $4, %esi /* adjust pointer */
+L(22): addl $4, %esi
+L(21): addl $4, %esi
+
+ /* What remains to do is to test which byte the NUL char is and
+ whether the searched character appears in one of the bytes
+ before. A special case is that the searched byte maybe NUL.
+ In this case a pointer to the terminating NUL char has to be
+ returned. */
+
+L(20): cmpb %cl, %dl /* is first byte == C? */
+ jne L(24) /* no => skip */
+ movl %esi, %eax /* store address as result */
+L(24): testb %dl, %dl /* is first byte == NUL? */
+ jz L(2) /* yes => return */
+
+ cmpb %cl, %dh /* is second byte == C? */
+ jne L(25) /* no => skip */
+ leal 1(%esi), %eax /* store address as result */
+L(25): testb %dh, %dh /* is second byte == NUL? */
+ jz L(2) /* yes => return */
+
+ shrl $16,%edx /* make upper bytes accessible */
+ cmpb %cl, %dl /* is third byte == C */
+ jne L(26) /* no => skip */
+ leal 2(%esi), %eax /* store address as result */
+L(26): testb %dl, %dl /* is third byte == NUL */
+ jz L(2) /* yes => return */
+
+ cmpb %cl, %dh /* is fourth byte == C */
+ jne L(2) /* no => skip */
+ leal 3(%esi), %eax /* store address as result */
+
+L(2): popl %esi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
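
The zero-byte test that the comments in strrchr describe can be written out in portable C. The following is a minimal sketch assuming 32-bit words; has_zero_byte and the sample values are illustrative names, not glibc interfaces:

    #include <stdint.h>
    #include <stdio.h>

    /* Adding the magic value to WORD produces a carry out of every
       non-zero byte into the "hole" bit above it (bits 8, 16, 24 and
       the CPU carry flag).  Removing the original bits and forcing all
       non-hole bits to 1 yields 0xffffffff (so +1 wraps to 0) exactly
       when no byte of WORD is 0.  */
    static int
    has_zero_byte (uint32_t word)
    {
      const uint32_t magic = 0xfefefeffu;
      uint32_t sum = word + magic;

      if (sum < word)                   /* carry out: top byte non-zero */
        {
          uint32_t holes = (sum ^ word) | magic;
          return holes + 1 != 0;        /* a cleared hole bit marks a 0 byte */
        }
      return 1;                         /* no carry out: some byte must be 0 */
    }

    int
    main (void)
    {
      printf ("%d %d\n",
              has_zero_byte (0x41424344),   /* no zero byte: prints 0 */
              has_zero_byte (0x41004344));  /* embedded zero: prints 1 */
      return 0;
    }

strrchr applies this test twice per dword: once on the raw word to find the terminating NUL, and once on the word XORed with c|c|c|c to find occurrences of C.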
diff --git a/REORG.TODO/sysdeps/i386/strspn.S b/REORG.TODO/sysdeps/i386/strspn.S
new file mode 100644
index 0000000000..d433eb6af5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strspn.S
@@ -0,0 +1,240 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+ which contains only characters from SS.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+#define SKIP STR+4
+
+ .text
+ENTRY (strspn)
+
+ movl STR(%esp), %edx
+ movl SKIP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the characters of the skip set we
+ clear the whole table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!! */
+
+ pushl %ecx /* make a 256 bytes long block filled with 0 */
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl %ecx
+ cfi_adjust_cfa_offset (4)
+ pushl $0 /* These immediate values make label 2 */
+ cfi_adjust_cfa_offset (4)
+ pushl $0 /* aligned on a 16 byte boundary, which */
+ cfi_adjust_cfa_offset (4)
+ pushl $0 /* improves the performance of the loop. */
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+ pushl $0
+ cfi_adjust_cfa_offset (4)
+
+/* For understanding the following code remember that %ecx == 0 now.
+ Although all the following instructions only modify %cl we always
+ have a correct zero-extended 32-bit value in %ecx. */
+
+/* Don't change the "testb $0xff,%cl" to "testb %cl,%cl". We want
+ longer instructions so that the next loop aligns without adding nops. */
+
+L(2): movb (%eax), %cl /* get byte from skip set */
+ testb %cl, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in skip-set table */
+
+ movb 1(%eax), %cl /* get byte from skip set */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in skip-set table */
+
+ movb 2(%eax), %cl /* get byte from skip set */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in skip-set table */
+
+ movb 3(%eax), %cl /* get byte from skip set */
+ addl $4, %eax /* increment skip-set pointer */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in skip-set table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from skip set */
+
+L(1): leal -4(%edx), %eax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions:
+ 1. a character not in the skip set was found
+ and
+ 2. the end of the string was found
+ But as a sign that a character is in the skip set we store its
+ value in the table. Since the entry for NUL is itself NUL, the
+ loop terminates at the end of the string in every case. */
+
+L(3): addl $4, %eax /* adjust pointer for full loop round */
+
+ movb (%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(4) /* no => return */
+
+ movb 1(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(5) /* no => return */
+
+ movb 2(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(6) /* no => return */
+
+ movb 3(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jnz L(3) /* yes => start loop again */
+
+ incl %eax /* adjust pointer */
+L(6): incl %eax
+L(5): incl %eax
+
+L(4): addl $256, %esp /* remove stopset */
+ cfi_adjust_cfa_offset (-256)
+ subl %edx, %eax /* we have to return the number of valid
+ characters, so compute distance to first
+ non-valid character */
+ ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
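
The table technique used above has a compact C counterpart. A sketch under the same semantics (my_strspn is an illustrative name, not a glibc symbol):

    #include <stddef.h>

    /* Build the 256-entry table the way the assembly does: every
       character of the skip set is stored at its own index, so
       table[0] stays 0 and the terminating NUL of STR fails the
       membership test like any character outside the set.  */
    static size_t
    my_strspn (const char *str, const char *skipset)
    {
      unsigned char table[256] = { 0 };

      for (const unsigned char *p = (const unsigned char *) skipset;
           *p != 0; ++p)
        table[*p] = *p;

      const unsigned char *s = (const unsigned char *) str;
      while (table[*s] != 0)    /* one test covers both loop exits */
        ++s;

      return (size_t) (s - (const unsigned char *) str);
    }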
diff --git a/REORG.TODO/sysdeps/i386/sub_n.S b/REORG.TODO/sysdeps/i386/sub_n.S
new file mode 100644
index 0000000000..3649da29e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sub_n.S
@@ -0,0 +1,111 @@
+/* i80386 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ store difference in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+8 /* space for 2 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define S2 S1+4
+#define SIZE S2+4
+
+ .text
+ENTRY (__mpn_sub_n)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+
+ movl RES(%esp),%edi
+ cfi_rel_offset (edi, 4)
+ movl S1(%esp),%esi
+ cfi_rel_offset (esi, 0)
+ movl S2(%esp),%edx
+ movl SIZE(%esp),%ecx
+ movl %ecx,%eax
+ shrl $3,%ecx /* compute count for unrolled loop */
+ negl %eax
+ andl $7,%eax /* get index where to start loop */
+ jz L(oop) /* necessary special case for 0 */
+ incl %ecx /* adjust loop count */
+ shll $2,%eax /* adjustment for pointers... */
+ subl %eax,%edi /* ... since they are offset ... */
+ subl %eax,%esi /* ... by a constant when we ... */
+ subl %eax,%edx /* ... enter the loop */
+ shrl $2,%eax /* restore previous value */
+#ifdef PIC
+/* Calculate start address in loop for PIC. Due to limitations in some
+ assemblers, L(oop)-L(0)-3 cannot be put into the leal */
+ call L(0)
+ cfi_adjust_cfa_offset (4)
+L(0): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $(L(oop)-L(0)-3),%eax
+ addl $4,%esp
+ cfi_adjust_cfa_offset (-4)
+#else
+/* Calculate start address in loop for non-PIC. */
+ leal (L(oop) - 3)(%eax,%eax,8),%eax
+#endif
+ jmp *%eax /* jump into loop */
+ ALIGN (3)
+L(oop): movl (%esi),%eax
+ sbbl (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ sbbl 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ sbbl 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ sbbl 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ sbbl 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ sbbl 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ sbbl 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ sbbl 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_sub_n)
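
For reference, the operation the unrolled loop above computes is plain limb-wise subtraction with borrow. A portable sketch, with uint32_t standing in for the 32-bit limb type and mpn_sub_n_ref as an illustrative name:

    #include <stddef.h>
    #include <stdint.h>

    /* res[0..n) = s1[0..n) - s2[0..n); returns the final borrow
       (0 or 1), which the assembly materializes with sbbl/negl.  */
    static uint32_t
    mpn_sub_n_ref (uint32_t *res, const uint32_t *s1,
                   const uint32_t *s2, size_t n)
    {
      uint32_t borrow = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint32_t a = s1[i], b = s2[i];
          res[i] = a - b - borrow;
          /* Borrow out iff b plus the incoming borrow exceeds a.  */
          borrow = (a < b) || (a == b && borrow != 0);
        }
      return borrow;
    }

The computed jump (jmp *%eax above) merely dispatches into the middle of eight copies of this loop body so that SIZE need not be a multiple of 8.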
diff --git a/REORG.TODO/sysdeps/i386/submul_1.S b/REORG.TODO/sysdeps/i386/submul_1.S
new file mode 100644
index 0000000000..c765e8dd79
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/submul_1.S
@@ -0,0 +1,86 @@
+/* i80386 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define PARMS 4+16 /* space for 4 saved regs */
+#define RES PARMS
+#define S1 RES+4
+#define SIZE S1+4
+#define S2LIMB SIZE+4
+
+#define res_ptr edi
+#define s1_ptr esi
+#define sizeP ecx
+#define s2_limb ebx
+
+ .text
+ENTRY (__mpn_submul_1)
+
+ pushl %edi
+ cfi_adjust_cfa_offset (4)
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ pushl %ebp
+ cfi_adjust_cfa_offset (4)
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 12)
+ cfi_rel_offset (esi, 8)
+ cfi_rel_offset (ebp, 4)
+ cfi_rel_offset (ebx, 0)
+
+ movl RES(%esp), %res_ptr
+ movl S1(%esp), %s1_ptr
+ movl SIZE(%esp), %sizeP
+ movl S2LIMB(%esp), %s2_limb
+ leal (%res_ptr,%sizeP,4), %res_ptr
+ leal (%s1_ptr,%sizeP,4), %s1_ptr
+ negl %sizeP
+ xorl %ebp, %ebp
+ ALIGN (3)
+L(oop):
+ movl (%s1_ptr,%sizeP,4), %eax
+ mull %s2_limb
+ addl %ebp, %eax
+ adcl $0, %edx
+ subl %eax, (%res_ptr,%sizeP,4)
+ adcl $0, %edx
+ movl %edx, %ebp
+
+ incl %sizeP
+ jnz L(oop)
+ movl %ebp, %eax
+
+ popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ popl %ebp
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebp)
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__mpn_submul_1)
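
Per limb, the loop above forms a 32x32->64-bit product with mull, adds the running carry, subtracts the low half from the destination limb, and folds the resulting borrow into the next carry. A portable sketch (mpn_submul_1_ref is an illustrative name):

    #include <stddef.h>
    #include <stdint.h>

    /* res[0..n) -= s1[0..n) * s2_limb; returns the high limb left
       over, i.e. what would have to be subtracted from res[n].  */
    static uint32_t
    mpn_submul_1_ref (uint32_t *res, const uint32_t *s1,
                      size_t n, uint32_t s2_limb)
    {
      uint32_t carry = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t prod = (uint64_t) s1[i] * s2_limb + carry;
          uint32_t lo = (uint32_t) prod;
          uint32_t hi = (uint32_t) (prod >> 32);
          uint32_t r = res[i];
          res[i] = r - lo;
          carry = hi + (r < lo);   /* fold in the subtraction's borrow */
        }
      return carry;
    }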
diff --git a/REORG.TODO/sysdeps/i386/symbol-hacks.h b/REORG.TODO/sysdeps/i386/symbol-hacks.h
new file mode 100644
index 0000000000..36a13c83f7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/symbol-hacks.h
@@ -0,0 +1,21 @@
+/* Hacks needed for symbol manipulation. i386 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
+
+#include_next "symbol-hacks.h"
diff --git a/REORG.TODO/sysdeps/i386/sys/ucontext.h b/REORG.TODO/sysdeps/i386/sys/ucontext.h
new file mode 100644
index 0000000000..fb5df11965
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sys/ucontext.h
@@ -0,0 +1,139 @@
+/* Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* System V/i386 ABI compliant context switching support. */
+
+#ifndef _SYS_UCONTEXT_H
+#define _SYS_UCONTEXT_H 1
+
+#include <features.h>
+
+#include <bits/types/sigset_t.h>
+#include <bits/sigcontext.h>
+#include <bits/types/stack_t.h>
+
+
+/* Type for general register. */
+typedef int greg_t;
+
+/* Number of general registers. */
+#define __NGREG 19
+#ifdef __USE_MISC
+# define NGREG __NGREG
+#endif
+
+/* Container for all general registers. */
+typedef greg_t gregset_t[__NGREG];
+
+#ifdef __USE_MISC
+/* Number of each register in the `gregset_t' array. */
+enum
+{
+ REG_GS = 0,
+# define REG_GS REG_GS
+ REG_FS,
+# define REG_FS REG_FS
+ REG_ES,
+# define REG_ES REG_ES
+ REG_DS,
+# define REG_DS REG_DS
+ REG_EDI,
+# define REG_EDI REG_EDI
+ REG_ESI,
+# define REG_ESI REG_ESI
+ REG_EBP,
+# define REG_EBP REG_EBP
+ REG_ESP,
+# define REG_ESP REG_ESP
+ REG_EBX,
+# define REG_EBX REG_EBX
+ REG_EDX,
+# define REG_EDX REG_EDX
+ REG_ECX,
+# define REG_ECX REG_ECX
+ REG_EAX,
+# define REG_EAX REG_EAX
+ REG_TRAPNO,
+# define REG_TRAPNO REG_TRAPNO
+ REG_ERR,
+# define REG_ERR REG_ERR
+ REG_EIP,
+# define REG_EIP REG_EIP
+ REG_CS,
+# define REG_CS REG_CS
+ REG_EFL,
+# define REG_EFL REG_EFL
+ REG_UESP,
+# define REG_UESP REG_UESP
+ REG_SS
+# define REG_SS REG_SS
+};
+#endif
+
+#ifdef __USE_MISC
+# define __ctx(fld) fld
+# define __ctxt(tag) tag
+#else
+# define __ctx(fld) __ ## fld
+# define __ctxt(tag) /* Empty. */
+#endif
+
+/* Structure to describe FPU registers. */
+typedef struct fpregset
+ {
+ union
+ {
+ struct __ctxt(fpchip_state)
+ {
+ int __ctx(state)[27];
+ int __ctx(status);
+ } __ctx(fpchip_state);
+
+ struct __ctxt(fp_emul_space)
+ {
+ char __ctx(fp_emul)[246];
+ char __ctx(fp_epad)[2];
+ } __ctx(fp_emul_space);
+
+ int __ctx(f_fpregs)[62];
+ } __ctx(fp_reg_set);
+
+ long int __ctx(f_wregs)[33];
+ } fpregset_t;
+
+/* Context to describe whole processor state. */
+typedef struct
+ {
+ gregset_t __ctx(gregs);
+ fpregset_t __ctx(fpregs);
+ } mcontext_t;
+
+#undef __ctx
+#undef __ctxt
+
+/* Userlevel context. */
+typedef struct ucontext
+ {
+ unsigned long int uc_flags;
+ struct ucontext *uc_link;
+ sigset_t uc_sigmask;
+ stack_t uc_stack;
+ mcontext_t uc_mcontext;
+ long int uc_filler[5];
+ } ucontext_t;
+
+#endif /* sys/ucontext.h */
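
The REG_* indices above are how a SA_SIGINFO handler inspects processor state. A minimal sketch for Linux/i386, which shares this register layout; it assumes _GNU_SOURCE so the REG_* names are visible (printf in a handler is for demonstration only, it is not async-signal-safe):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <ucontext.h>

    static void
    handler (int sig, siginfo_t *info, void *ctx)
    {
      ucontext_t *uc = ctx;
      /* gregs is the gregset_t defined above; REG_EIP indexes the
         interrupted instruction pointer.  */
      printf ("signal %d delivered at eip=%#x\n",
              sig, (unsigned int) uc->uc_mcontext.gregs[REG_EIP]);
    }

    int
    main (void)
    {
      struct sigaction sa;
      memset (&sa, 0, sizeof sa);
      sa.sa_sigaction = handler;
      sa.sa_flags = SA_SIGINFO;
      sigaction (SIGUSR1, &sa, NULL);
      raise (SIGUSR1);
      return 0;
    }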
diff --git a/REORG.TODO/sysdeps/i386/sysdep.h b/REORG.TODO/sysdeps/i386/sysdep.h
new file mode 100644
index 0000000000..d2b0860b99
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sysdep.h
@@ -0,0 +1,159 @@
+/* Assembler macros for i386.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdeps/generic/sysdep.h>
+
+#include <features.h> /* For __GNUC_PREREQ. */
+
+/* It is desirable that the names of PIC thunks match those used by
+ GCC so that multiple copies are eliminated by the linker. Because
+ GCC 4.6 and earlier use __i686 in the names, it is necessary to
+ override that predefined macro. */
+#if defined __i686 && defined __ASSEMBLER__
+#undef __i686
+#define __i686 __i686
+#endif
+
+#ifdef __ASSEMBLER__
+# define GET_PC_THUNK(reg) __x86.get_pc_thunk.reg
+#else
+# define GET_PC_THUNK_STR(reg) "__x86.get_pc_thunk." #reg
+#endif
+
+#ifdef __ASSEMBLER__
+
+/* Syntactic details of assembler. */
+
+/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */
+#define ALIGNARG(log2) 1<<log2
+#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+
+
+/* Define an entry point visible from C.
+
+ There is currently a bug in gdb which prevents us from specifying
+ incomplete stabs information. Fake some entries here which specify
+ the current source file. */
+#define ENTRY(name) \
+ .globl C_SYMBOL_NAME(name); \
+ .type C_SYMBOL_NAME(name),@function; \
+ .align ALIGNARG(4); \
+ C_LABEL(name) \
+ cfi_startproc; \
+ CALL_MCOUNT
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ ASM_SIZE_DIRECTIVE(name)
+
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
+/* If compiled for profiling, call `mcount' at the start of each function. */
+#ifdef PROF
+/* The mcount code relies on a normal frame pointer being on the stack
+ to locate our caller, so push one just for its benefit. */
+#define CALL_MCOUNT \
+ pushl %ebp; cfi_adjust_cfa_offset (4); movl %esp, %ebp; \
+ cfi_def_cfa_register (ebp); call JUMPTARGET(mcount); \
+ popl %ebp; cfi_def_cfa (esp, 4);
+#else
+#define CALL_MCOUNT /* Do nothing. */
+#endif
+
+/* Since C identifiers are not normally prefixed with an underscore
+ on this system, the asm identifier `syscall_error' intrudes on the
+ C name space. Make sure we use an innocuous name. */
+#define syscall_error __syscall_error
+#define mcount _mcount
+
+#define PSEUDO(name, syscall_name, args) \
+ .globl syscall_error; \
+lose: SYSCALL_PIC_SETUP \
+ jmp JUMPTARGET(syscall_error); \
+ ENTRY (name) \
+ DO_CALL (syscall_name, args); \
+ jb lose
+
+#undef PSEUDO_END
+#define PSEUDO_END(name) \
+ END (name)
+
+# define SETUP_PIC_REG(reg) \
+ .ifndef GET_PC_THUNK(reg); \
+ .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits; \
+ .globl GET_PC_THUNK(reg); \
+ .hidden GET_PC_THUNK(reg); \
+ .p2align 4; \
+ .type GET_PC_THUNK(reg),@function; \
+GET_PC_THUNK(reg): \
+ movl (%esp), %e##reg; \
+ ret; \
+ .size GET_PC_THUNK(reg), . - GET_PC_THUNK(reg); \
+ .previous; \
+ .endif; \
+ call GET_PC_THUNK(reg)
+
+# define LOAD_PIC_REG(reg) \
+ SETUP_PIC_REG(reg); addl $_GLOBAL_OFFSET_TABLE_, %e##reg
+
+#undef JUMPTARGET
+#ifdef PIC
+#define JUMPTARGET(name) name##@PLT
+#define SYSCALL_PIC_SETUP \
+ pushl %ebx; \
+ cfi_adjust_cfa_offset (4); \
+ call 0f; \
+0: popl %ebx; \
+ cfi_adjust_cfa_offset (-4); \
+ addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %ebx;
+
+#else
+#define JUMPTARGET(name) name
+#define SYSCALL_PIC_SETUP /* Nothing. */
+#endif
+
+/* Local label name for asm code. */
+#ifndef L
+#define L(name) .L##name
+#endif
+
+#define atom_text_section .section ".text.atom", "ax"
+
+#else /* __ASSEMBLER__ */
+
+# define SETUP_PIC_REG_STR(reg) \
+ ".ifndef " GET_PC_THUNK_STR (reg) "\n" \
+ ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
+ ".globl " GET_PC_THUNK_STR (reg) "\n" \
+ ".hidden " GET_PC_THUNK_STR (reg) "\n" \
+ ".p2align 4\n" \
+ ".type " GET_PC_THUNK_STR (reg) ",@function\n" \
+GET_PC_THUNK_STR (reg) ":" \
+ "movl (%%esp), %%e" #reg "\n" \
+ "ret\n" \
+ ".size " GET_PC_THUNK_STR (reg) ", . - " GET_PC_THUNK_STR (reg) "\n" \
+ ".previous\n" \
+ ".endif\n" \
+ "call " GET_PC_THUNK_STR (reg)
+
+# define LOAD_PIC_REG_STR(reg) \
+ SETUP_PIC_REG_STR (reg) "\naddl $_GLOBAL_OFFSET_TABLE_, %%e" #reg
+
+#endif /* __ASSEMBLER__ */
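
The call/pop idiom behind SETUP_PIC_REG and SYSCALL_PIC_SETUP can be shown in isolation. A sketch for i386 with the GNU toolchain (got_address is an illustrative name); it mirrors the macro above, including the [.-0b] addend that corrects the pc-relative value of _GLOBAL_OFFSET_TABLE_ for the distance from label 0 to the add:

    /* Returns the runtime address of the global offset table.  The
       call pushes the address of label 0, popl fetches it, and the
       relocated immediate turns it into the GOT address.  */
    static void *
    got_address (void)
    {
      void *got;
      asm ("call 0f\n"
           "0:\tpopl %0\n\t"
           "addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %0"
           : "=r" (got));
      return got;
    }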
diff --git a/REORG.TODO/sysdeps/i386/tls-macros.h b/REORG.TODO/sysdeps/i386/tls-macros.h
new file mode 100644
index 0000000000..053cba05d1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tls-macros.h
@@ -0,0 +1,78 @@
+#include <features.h> /* For __GNUC_PREREQ. */
+
+#define TLS_LE(x) \
+ ({ int *__l; \
+ asm ("movl %%gs:0,%0\n\t" \
+ "subl $" #x "@tpoff,%0" \
+ : "=r" (__l)); \
+ __l; })
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_IE(x) \
+ ({ int *__l; \
+ asm ("movl %%gs:0,%0\n\t" \
+ "subl " #x "@gottpoff(%%ebx),%0" \
+ : "=r" (__l)); \
+ __l; })
+#else
+# define TLS_IE(x) \
+ ({ int *__l, __b; \
+ asm ("call 1f\n\t" \
+ ".subsection 1\n" \
+ "1:\tmovl (%%esp), %%ebx\n\t" \
+ "ret\n\t" \
+ ".previous\n\t" \
+ "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \
+ "movl %%gs:0,%0\n\t" \
+ "subl " #x "@gottpoff(%%ebx),%0" \
+ : "=r" (__l), "=&b" (__b)); \
+ __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_LD(x) \
+ ({ int *__l, __c, __d; \
+ asm ("leal " #x "@tlsldm(%%ebx),%%eax\n\t" \
+ "call ___tls_get_addr@plt\n\t" \
+ "leal " #x "@dtpoff(%%eax), %%eax" \
+ : "=a" (__l), "=&c" (__c), "=&d" (__d)); \
+ __l; })
+#else
+# define TLS_LD(x) \
+ ({ int *__l, __b, __c, __d; \
+ asm ("call 1f\n\t" \
+ ".subsection 1\n" \
+ "1:\tmovl (%%esp), %%ebx\n\t" \
+ "ret\n\t" \
+ ".previous\n\t" \
+ "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \
+ "leal " #x "@tlsldm(%%ebx),%%eax\n\t" \
+ "call ___tls_get_addr@plt\n\t" \
+ "leal " #x "@dtpoff(%%eax), %%eax" \
+ : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d)); \
+ __l; })
+#endif
+
+#if defined PIC && !__GNUC_PREREQ (5,0)
+# define TLS_GD(x) \
+ ({ int *__l, __c, __d; \
+ asm ("leal " #x "@tlsgd(%%ebx),%%eax\n\t" \
+ "call ___tls_get_addr@plt\n\t" \
+ "nop" \
+ : "=a" (__l), "=&c" (__c), "=&d" (__d)); \
+ __l; })
+#else
+# define TLS_GD(x) \
+ ({ int *__l, __b, __c, __d; \
+ asm ("call 1f\n\t" \
+ ".subsection 1\n" \
+ "1:\tmovl (%%esp), %%ebx\n\t" \
+ "ret\n\t" \
+ ".previous\n\t" \
+ "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \
+ "leal " #x "@tlsgd(%%ebx),%%eax\n\t" \
+ "call ___tls_get_addr@plt\n\t" \
+ "nop" \
+ : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d)); \
+ __l; })
+#endif
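
These macros hand-code the four TLS access models; at the C level the compiler chooses a model per variable, and the choice can be forced with the tls_model attribute (GCC/Clang). A sketch with illustrative variable names:

    /* On i386 each access below compiles to one of the sequences
       spelled out in TLS_LE, TLS_IE, TLS_LD and TLS_GD above.  */
    static __thread int le_var __attribute__ ((tls_model ("local-exec")));
    static __thread int ie_var __attribute__ ((tls_model ("initial-exec")));
    static __thread int ld_var __attribute__ ((tls_model ("local-dynamic")));
    static __thread int gd_var __attribute__ ((tls_model ("global-dynamic")));

    int
    sum_tls (void)
    {
      return le_var + ie_var + ld_var + gd_var;
    }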
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.c b/REORG.TODO/sysdeps/i386/tlsdesc.c
new file mode 100644
index 0000000000..90de2bb05e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.c
@@ -0,0 +1,268 @@
+/* Manage TLS descriptors. i386 version.
+ Copyright (C) 2005-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <link.h>
+#include <ldsodefs.h>
+#include <elf/dynamic-link.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <dl-unmap-segments.h>
+#include <tlsdeschtab.h>
+
+/* The following 4 functions take an entry_check_offset argument.
+ It's computed by the caller as an offset between its entry point
+ and the call site, such that by adding the built-in return address
+ that is implicitly passed to the function with this offset, we can
+ easily obtain the caller's entry point to compare with the entry
+ point given in the TLS descriptor. If it's changed, we want to
+ return immediately. */
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+ that reference the *ABS* segment in their own link maps. The
+ argument is the addend originally stored there. */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_abs_plus_addend_fixup (struct tlsdesc volatile *td,
+ struct link_map *l,
+ ptrdiff_t entry_check_offset)
+{
+ ptrdiff_t addend = (ptrdiff_t) td->arg;
+
+ if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+ - entry_check_offset))
+ return;
+
+#ifndef SHARED
+ CHECK_STATIC_TLS (l, l);
+#else
+ if (!TRY_STATIC_TLS (l, l))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic (l, addend);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+#endif
+ {
+ td->arg = (void*) (addend - l->l_tls_offset);
+ td->entry = _dl_tlsdesc_return;
+ }
+
+ _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC REL relocations
+ that originally had zero addends. The argument location, that
+ originally held the addend, is used to hold a pointer to the
+ relocation, but it has to be restored before we call the function
+ that applies relocations. */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rel_fixup (struct tlsdesc volatile *td,
+ struct link_map *l,
+ ptrdiff_t entry_check_offset)
+{
+ const ElfW(Rel) *reloc = td->arg;
+
+ if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+ - entry_check_offset))
+ return;
+
+ /* The code below was borrowed from _dl_fixup(),
+ except for checking for STB_LOCAL. */
+ const ElfW(Sym) *const symtab
+ = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+ const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+ const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+ lookup_t result;
+
+ /* Look up the target symbol. If the normal lookup rules are not
+ used don't look in the global scope. */
+ if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+ && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+ {
+ const struct r_found_version *version = NULL;
+
+ if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+ {
+ const ElfW(Half) *vernum =
+ (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+ ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+ version = &l->l_versions[ndx];
+ if (version->hash == 0)
+ version = NULL;
+ }
+
+ result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+ l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+ DL_LOOKUP_ADD_DEPENDENCY, NULL);
+ }
+ else
+ {
+ /* We already found the symbol. The module (and therefore its load
+ address) is also known. */
+ result = l;
+ }
+
+ if (!sym)
+ {
+ td->arg = 0;
+ td->entry = _dl_tlsdesc_undefweak;
+ }
+ else
+ {
+# ifndef SHARED
+ CHECK_STATIC_TLS (l, result);
+# else
+ if (!TRY_STATIC_TLS (l, result))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+# endif
+ {
+ td->arg = (void*)(sym->st_value - result->l_tls_offset);
+ td->entry = _dl_tlsdesc_return;
+ }
+ }
+
+ _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to lazily resolve TLS_DESC RELA relocations.
+ The argument location is used to hold a pointer to the relocation. */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
+ struct link_map *l,
+ ptrdiff_t entry_check_offset)
+{
+ const ElfW(Rela) *reloc = td->arg;
+
+ if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0)
+ - entry_check_offset))
+ return;
+
+ /* The code below was borrowed from _dl_fixup(),
+ except for checking for STB_LOCAL. */
+ const ElfW(Sym) *const symtab
+ = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
+ const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
+ const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+ lookup_t result;
+
+ /* Look up the target symbol. If the normal lookup rules are not
+ used don't look in the global scope. */
+ if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
+ && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
+ {
+ const struct r_found_version *version = NULL;
+
+ if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+ {
+ const ElfW(Half) *vernum =
+ (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
+ ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
+ version = &l->l_versions[ndx];
+ if (version->hash == 0)
+ version = NULL;
+ }
+
+ result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
+ l->l_scope, version, ELF_RTYPE_CLASS_PLT,
+ DL_LOOKUP_ADD_DEPENDENCY, NULL);
+ }
+ else
+ {
+ /* We already found the symbol. The module (and therefore its load
+ address) is also known. */
+ result = l;
+ }
+
+ if (!sym)
+ {
+ td->arg = (void*) reloc->r_addend;
+ td->entry = _dl_tlsdesc_undefweak;
+ }
+ else
+ {
+# ifndef SHARED
+ CHECK_STATIC_TLS (l, result);
+# else
+ if (!TRY_STATIC_TLS (l, result))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value
+ + reloc->r_addend);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+# endif
+ {
+ td->arg = (void*) (sym->st_value - result->l_tls_offset
+ + reloc->r_addend);
+ td->entry = _dl_tlsdesc_return;
+ }
+ }
+
+ _dl_tlsdesc_wake_up_held_fixups ();
+}
+
+/* This function is used to avoid busy waiting for other threads to
+ complete the lazy relocation. Once another thread wins the race to
+ relocate a TLS descriptor, it sets the descriptor up such that this
+ function is called to wait until the resolver releases the
+ lock. */
+
+void
+__attribute__ ((regparm (3))) attribute_hidden
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
+ struct link_map *l __attribute__((__unused__)),
+ ptrdiff_t entry_check_offset)
+{
+ /* Maybe we're lucky and can return early. */
+ if (__builtin_return_address (0) - entry_check_offset != td->entry)
+ return;
+
+ /* Locking here will stop execution until the running resolver runs
+ _dl_tlsdesc_wake_up_held_fixups(), releasing the lock.
+
+ FIXME: We'd be better off waiting on a condition variable, such
+ that we didn't have to hold the lock throughout the relocation
+ processing. */
+ __rtld_lock_lock_recursive (GL(dl_load_lock));
+ __rtld_lock_unlock_recursive (GL(dl_load_lock));
+}
+
+
+/* Unmap the dynamic object, but also release its TLS descriptor table
+ if there is one. */
+
+void
+internal_function
+_dl_unmap (struct link_map *map)
+{
+ _dl_unmap_segments (map);
+
+#ifdef SHARED
+ if (map->l_mach.tlsdesc_table)
+ htab_delete (map->l_mach.tlsdesc_table);
+#endif
+}
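
Conceptually, each descriptor that these fixup functions rewrite is an (entry, arg) pair, and a TLS access is an indirect call through it; resolution just swaps in a cheaper entry once the symbol is known. A deliberately simplified sketch, not glibc's actual layout or calling convention (the real i386 entries use regparm and return an offset from the thread pointer):

    /* Illustrative only.  */
    struct tlsdesc_sketch
    {
      void *(*entry) (struct tlsdesc_sketch *td);
      void *arg;
    };

    static void *
    tls_access (struct tlsdesc_sketch *td)
    {
      /* Before resolution, entry points at a lazy trampoline like the
         fixups above; afterwards, at a fast path that only reads arg.  */
      return td->entry (td);
    }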
diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.sym b/REORG.TODO/sysdeps/i386/tlsdesc.sym
new file mode 100644
index 0000000000..33854975d0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tlsdesc.sym
@@ -0,0 +1,17 @@
+#include <stddef.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <link.h>
+#include <dl-tlsdesc.h>
+
+--
+
+-- Abuse tls.h macros to derive offsets relative to the thread register.
+
+DTV_OFFSET offsetof(struct pthread, header.dtv)
+
+TLSDESC_ARG offsetof(struct tlsdesc, arg)
+
+TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
+TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
+TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
diff --git a/REORG.TODO/sysdeps/i386/tst-audit.h b/REORG.TODO/sysdeps/i386/tst-audit.h
new file mode 100644
index 0000000000..87bf199c85
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit.h
@@ -0,0 +1,25 @@
+/* Definitions for testing PLT entry/exit auditing. i386 version.
+
+ Copyright (C) 2012-2017 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define pltenter la_i86_gnu_pltenter
+#define pltexit la_i86_gnu_pltexit
+#define La_regs La_i86_regs
+#define La_retval La_i86_retval
+#define int_retval lrv_eax
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.c b/REORG.TODO/sysdeps/i386/tst-audit3.c
new file mode 100644
index 0000000000..b67a59d733
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.c
@@ -0,0 +1,37 @@
+/* Test case for i386 preserved registers in dynamic linker.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+static int
+do_test (void)
+{
+ long long ll = audit1_test (1, 2, 3);
+ if (ll != 30)
+ abort ();
+
+ float f = audit2_test (1, 2, 3);
+ if (f != 30)
+ abort ();
+
+ return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.h b/REORG.TODO/sysdeps/i386/tst-audit3.h
new file mode 100644
index 0000000000..f6d3b9181e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-audit3.h
@@ -0,0 +1,20 @@
+/* Test case for i386 preserved registers in dynamic linker.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+extern long long audit1_test (int, int, int) __attribute__ ((regparm(3)));
+extern float audit2_test (int, int, int) __attribute__ ((regparm(3)));
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3a.c b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
new file mode 100644
index 0000000000..a333cdcff9
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c
@@ -0,0 +1,38 @@
+/* Test case for i386 preserved registers in dynamic linker.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include "tst-audit3.h"
+
+long long
+__attribute__ ((regparm(3)))
+audit1_test (int i, int j, int k)
+{
+ if (i != 1 || j != 2 || k != 3)
+ abort ();
+ return 30;
+}
+
+float
+__attribute__ ((regparm(3)))
+audit2_test (int i, int j, int k)
+{
+ if (i != 1 || j != 2 || k != 3)
+ abort ();
+ return 30;
+}
diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3b.c b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
new file mode 100644
index 0000000000..523f3cec90
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c
@@ -0,0 +1,186 @@
+/* Test case for i386 preserved registers in dynamic linker.
+ Copyright (C) 2015-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <link.h>
+#include <bits/wordsize.h>
+#include <gnu/lib-names.h>
+
+unsigned int
+la_version (unsigned int v)
+{
+ setlinebuf (stdout);
+
+ printf ("version: %u\n", v);
+
+ char buf[20];
+ sprintf (buf, "%u", v);
+
+ return v;
+}
+
+void
+la_activity (uintptr_t *cookie, unsigned int flag)
+{
+ const char *flagstr;
+ switch (flag)
+ {
+ case LA_ACT_CONSISTENT:
+ flagstr = "consistent";
+ break;
+ case LA_ACT_ADD:
+ flagstr = "add";
+ break;
+ case LA_ACT_DELETE:
+ flagstr = "delete";
+ break;
+ default:
+ printf ("activity: unknown activity %u\n", flag);
+ return;
+ }
+ printf ("activity: %s\n", flagstr);
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+ const char *flagstr;
+ switch (flag)
+ {
+ case LA_SER_ORIG:
+ flagstr = "LA_SET_ORIG";
+ break;
+ case LA_SER_LIBPATH:
+ flagstr = "LA_SER_LIBPATH";
+ break;
+ case LA_SER_RUNPATH:
+ flagstr = "LA_SER_RUNPATH";
+ break;
+ case LA_SER_CONFIG:
+ flagstr = "LA_SER_CONFIG";
+ break;
+ case LA_SER_DEFAULT:
+ flagstr = "LA_SER_DEFAULT";
+ break;
+ case LA_SER_SECURE:
+ flagstr = "LA_SER_SECURE";
+ break;
+ default:
+ printf ("objsearch: %s, unknown flag %d\n", name, flag);
+ return (char *) name;
+ }
+
+ printf ("objsearch: %s, %s\n", name, flagstr);
+ return (char *) name;
+}
+
+unsigned int
+la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie)
+{
+ printf ("objopen: %ld, %s\n", lmid, l->l_name);
+
+ return 3;
+}
+
+void
+la_preinit (uintptr_t *cookie)
+{
+ printf ("preinit\n");
+}
+
+unsigned int
+la_objclose (uintptr_t *cookie)
+{
+ printf ("objclose\n");
+ return 0;
+}
+
+uintptr_t
+la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook,
+ uintptr_t *defcook, unsigned int *flags, const char *symname)
+{
+ printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+ symname, (long int) sym->st_value, ndx, *flags);
+
+ return sym->st_value;
+}
+
+#include "tst-audit.h"
+
+ElfW(Addr)
+pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+ uintptr_t *defcook, La_regs *regs, unsigned int *flags,
+ const char *symname, long int *framesizep)
+{
+ printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+ symname, (long int) sym->st_value, ndx, *flags);
+
+ if (strcmp (symname, "audit1_test") == 0
+ || strcmp (symname, "audit2_test") == 0)
+ {
+ if (regs->lr_eax != 1
+ || regs->lr_edx != 2
+ || regs->lr_ecx != 3)
+ abort ();
+
+ *framesizep = 200;
+ }
+
+ return sym->st_value;
+}
+
+unsigned int
+pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+ uintptr_t *defcook, const La_regs *inregs, La_retval *outregs,
+ const char *symname)
+{
+ printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n",
+ symname, (long int) sym->st_value, ndx,
+ (ptrdiff_t) outregs->int_retval);
+
+ if (strcmp (symname, "audit1_test") == 0
+ || strcmp (symname, "audit2_test") == 0)
+ {
+ if (inregs->lr_eax != 1
+ || inregs->lr_edx != 2
+ || inregs->lr_ecx != 3)
+ abort ();
+
+ if (strcmp (symname, "audit1_test") == 0)
+ {
+ long long x = ((unsigned long long) outregs->lrv_eax
+ | (unsigned long long) outregs->lrv_edx << 32);
+
+ if (x != 30)
+ abort ();
+ }
+ else if (strcmp (symname, "audit2_test") == 0)
+ {
+ if (outregs->lrv_st0 != 30)
+ abort ();
+ }
+ }
+
+ return 0;
+}
diff --git a/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
new file mode 100755
index 0000000000..83a1dc59fb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Make sure no code in ld.so uses xmm/ymm/zmm registers on i386.
+# Copyright (C) 2009-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+set -e
+
+objpfx="$1"
+NM="$2"
+OBJDUMP="$3"
+READELF="$4"
+
+tmp=$(mktemp ${objpfx}tst-ld-sse-use.XXXXXX)
+trap 'rm -f "$tmp"' 1 2 3 15
+
+# List of object files we have to test
+rtldobjs=$($READELF -W -wi ${objpfx}dl-allobjs.os |
+ awk '/^ </ { if ($5 == "(DW_TAG_compile_unit)") c=1; else c=0 } $2 == "DW_AT_name" { if (c == 1) print $NF }' |
+ sed 's,\(.*/\|\)\([_[:alnum:]-]*[.]\).$,\2os,')
+rtldobjs="$rtldobjs $(ar t ${objpfx}rtld-libc.a)"
+
+# OBJECT symbols can be ignored.
+$READELF -sW ${objpfx}dl-allobjs.os ${objpfx}rtld-libc.a |
+egrep " OBJECT *GLOBAL " |
+awk '{if ($7 != "ABS") print $8 }' |
+sort -u > "$tmp"
+declare -a objects
+objects=($(cat "$tmp"))
+
+objs="dl-runtime.os"
+tocheck="dl-runtime.os"
+
+while test -n "$objs"; do
+ this="$objs"
+ objs=""
+
+ for f in $this; do
+ undef=$($NM -u "$objpfx"../*/"$f" | awk '{print $2}')
+ if test -n "$undef"; then
+ for s in $undef; do
+ for obj in ${objects[*]} "_GLOBAL_OFFSET_TABLE_"; do
+ if test "$obj" = "$s"; then
+ continue 2
+ fi
+ done
+ for o in $rtldobjs; do
+ ro=$(echo "$objpfx"../*/"$o")
+ if $NM -g --defined-only "$ro" | egrep -qs " $s\$"; then
+ if ! (echo "$tocheck $objs" | fgrep -qs "$o"); then
+ echo "$o needed for $s"
+ objs="$objs $o"
+ fi
+ break;
+ fi
+ done
+ done
+ fi
+ done
+ tocheck="$tocheck$objs"
+done
+
+echo
+echo
+echo "object files needed: $tocheck"
+
+cp /dev/null "$tmp"
+for f in $tocheck; do
+ $OBJDUMP -d "$objpfx"../*/"$f" |
+ awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xyz]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
+ while read fct; do
+ if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then
+ continue;
+ fi
+ echo "function $fct in $f modifies xmm/ymm/zmm" >> "$tmp"
+ result=1
+ done
+done
+
+if test -s "$tmp"; then
+ echo
+ echo
+ cat "$tmp"
+ result=1
+else
+ result=0
+fi
+
+rm "$tmp"
+exit $result
diff --git a/REORG.TODO/sysdeps/i386/tst-stack-align.h b/REORG.TODO/sysdeps/i386/tst-stack-align.h
new file mode 100644
index 0000000000..76276d4a28
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/tst-stack-align.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2004-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
+
+#define TEST_STACK_ALIGN() \
+ ({ \
+ int_al16 _m; \
+ double _d = 12.0; \
+ long double _ld = 15.0; \
+ int _ret = 0; \
+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
+ if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
+ if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
+ if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
+ _ret = 1; \
+ _ret; \
+ })
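
A driver for this header could look like the sketch below (hypothetical, assuming the header above is on the include path); it exits non-zero if any probed local is misaligned:

    #include "tst-stack-align.h"

    int
    main (void)
    {
      return TEST_STACK_ALIGN () ? 1 : 0;
    }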