1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vsqrtf.S"
  30 
  31 #include "libm.h"
  32 
  33         ENTRY(__vsqrtf)
  34         push    %rbp
  35         movq    %rsp,%rbp
  36 
  37 / on entry:
  38 /   %edi = n
  39 /   %rsi = x
  40 /   %edx = stridex
  41 /   %rcx = y
  42 /   %r8d = stridey
  43 
  44         movslq  %edx,%rdx               / sign extend and scale strides
  45         shlq    $2,%rdx
  46         movslq  %r8d,%r8
  47         shlq    $2,%r8
  48 
  49         cmpl    $4,%edi
  50         jl      .finish
  51 
  52         cmpq    $4,%rdx
  53         jne     .nonunit
  54         cmpq    $4,%r8
  55         jne     .nonunit
  56 
  57 / unit-stride case
  58         movq    %rdx,%r9
  59         shlq    $2,%r9
  60         movq    %r8,%r10
  61         shlq    $2,%r10
  62 
  63         .align  16
  64 .loop:
  65         movups  (%rsi),%xmm0
  66         addq    %r9,%rsi
  67         sqrtps  %xmm0,%xmm0
  68         movups  %xmm0,(%rcx)
  69         addq    %r10,%rcx
  70         subl    $4,%edi
  71         cmpl    $4,%edi
  72         jge     .loop
  73 
  74 .finish:
  75         testl   %edi,%edi
  76         jle     .done
  77 
  78 .finish_loop:
  79         movss   (%rsi),%xmm0
  80         addq    %rdx,%rsi
  81         sqrtss  %xmm0,%xmm0
  82         movss   %xmm0,(%rcx)
  83         addq    %r8,%rcx
  84         decl    %edi
  85         jg      .finish_loop
  86 
  87 .done:
  88         leave
  89         ret
  90 
  91         .align  16
  92 .nonunit:
  93         movss   (%rsi),%xmm0
  94         addq    %rdx,%rsi
  95         movss   (%rsi),%xmm1
  96         addq    %rdx,%rsi
  97         movss   (%rsi),%xmm2
  98         addq    %rdx,%rsi
  99         movss   (%rsi),%xmm3
 100         addq    %rdx,%rsi
 101 
 102         movlhps %xmm1,%xmm0             / xmm0:   0  x1   0  x0
 103         movlhps %xmm3,%xmm2             / xmm2:   0  x3   0  x2
 104         shufps  $0x88,%xmm2,%xmm0       / xmm0:  x3  x2  x1  x0
 105 
 106         sqrtps  %xmm0,%xmm0             / xmm0:  y3  y2  y1  y0
 107 
 108         movaps  %xmm0,%xmm1             / xmm1:  y3  y2  y1  y0
 109         shufps  $0xf5,%xmm0,%xmm1       / xmm1:  y3  y3  y1  y1
 110         movhlps %xmm0,%xmm2             / xmm2:   0  x3  y3  y2
 111         movhlps %xmm1,%xmm3             / xmm3:   0   0  y3  y3
 112 
 113         movss   %xmm0,(%rcx)
 114         addq    %r8,%rcx
 115         movss   %xmm1,(%rcx)
 116         addq    %r8,%rcx
 117         movss   %xmm2,(%rcx)
 118         addq    %r8,%rcx
 119         movss   %xmm3,(%rcx)
 120         addq    %r8,%rcx
 121 
 122         subl    $4,%edi
 123         cmpl    $4,%edi
 124         jge     .nonunit
 125 
 126         jmp     .finish
 127 
 128         SET_SIZE(__vsqrtf)