Linux-Crypto Archive mirror
 help / color / mirror / Atom feed
From: "Stefan Kanthak" <stefan.kanthak@nexgo.de>
To: <linux-kernel@vger.kernel.org>, <linux-crypto@vger.kernel.org>
Cc: <tim.c.chen@linux.intel.com>, <sean.m.gulley@intel.com>
Subject: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S
Date: Mon, 8 Apr 2024 11:26:52 +0200	[thread overview]
Message-ID: <5EEE09A9021540A5AAD8BFEEE915512D@H270> (raw)

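Use the shorter SSE2 instructions punpcklqdq/punpckhqdq in place of
palignr/pblendw and of pshufd $0x0E, and bias the K256 base pointer by
8*16 so that every round-constant access (-8*16 ... 7*16) can use a
short 8-bit displacement.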

--- -/arch/x86/crypto/sha256_ni_asm.S
+++ +/arch/x86/crypto/sha256_ni_asm.S
@@ -108,17 +108,17 @@
          * Need to reorder these appropriately
          * DCBA, HGFE -> ABEF, CDGH
          */
-        movdqu          0*16(DIGEST_PTR), STATE0
-        movdqu          1*16(DIGEST_PTR), STATE1
+        movdqu          0*16(DIGEST_PTR), STATE0        /* DCBA */
+        movdqu          1*16(DIGEST_PTR), STATE1        /* HGFE */
 
-        pshufd          $0xB1, STATE0,  STATE0          /* CDAB */
-        pshufd          $0x1B, STATE1,  STATE1          /* EFGH */
         movdqa          STATE0, MSGTMP4
-        palignr         $8, STATE1,  STATE0             /* ABEF */
-        pblendw         $0xF0, MSGTMP4, STATE1          /* CDGH */
+        punpcklqdq      STATE1, STATE0                  /* FEBA */
+        punpckhqdq      MSGTMP4, STATE1                 /* DCHG */
+        pshufd          $0x1B, STATE0,  STATE0          /* ABEF */
+        pshufd          $0xB1, STATE1,  STATE1          /* CDGH */
 
         movdqa          PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-        lea             K256(%rip), SHA256CONSTANTS
+        lea             K256+8*16(%rip), SHA256CONSTANTS /* biased: offsets -8*16..7*16 fit in disp8 */
 
 .Lloop0:
         /* Save hash values for addition after rounds */
@@ -129,18 +129,18 @@
         movdqu          0*16(DATA_PTR), MSG
         pshufb          SHUF_MASK, MSG
         movdqa          MSG, MSGTMP0
-                paddd           0*16(SHA256CONSTANTS), MSG
+                paddd           -8*16(SHA256CONSTANTS), MSG     /* = K256 + 0*16 (base is K256+8*16) */
                 sha256rnds2     STATE0, STATE1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG                        /* MSG[63:0] = MSG[127:64] */
                 sha256rnds2     STATE1, STATE0
 
         /* Rounds 4-7 */
         movdqu          1*16(DATA_PTR), MSG
         pshufb          SHUF_MASK, MSG
         movdqa          MSG, MSGTMP1
-                paddd           1*16(SHA256CONSTANTS), MSG
+                paddd           -7*16(SHA256CONSTANTS), MSG     /* = K256 + 1*16 */
                 sha256rnds2     STATE0, STATE1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP1, MSGTMP0
 
@@ -148,9 +148,9 @@
         movdqu          2*16(DATA_PTR), MSG
         pshufb          SHUF_MASK, MSG
         movdqa          MSG, MSGTMP2
-                paddd           2*16(SHA256CONSTANTS), MSG
+                paddd           -6*16(SHA256CONSTANTS), MSG     /* = K256 + 2*16 (base is K256+8*16) */
                 sha256rnds2     STATE0, STATE1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG                        /* MSG[63:0] = MSG[127:64] */
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP2, MSGTMP1
 
@@ -158,151 +158,151 @@
         movdqu          3*16(DATA_PTR), MSG
         pshufb          SHUF_MASK, MSG
         movdqa          MSG, MSGTMP3
-                paddd           3*16(SHA256CONSTANTS), MSG
+                paddd           -5*16(SHA256CONSTANTS), MSG     /* = K256 + 3*16 (base is K256+8*16) */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP3, MSGTMP4
         palignr         $4, MSGTMP2, MSGTMP4
         paddd           MSGTMP4, MSGTMP0
         sha256msg2      MSGTMP3, MSGTMP0
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG                        /* MSG[63:0] = MSG[127:64] */
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP3, MSGTMP2
 
         /* Rounds 16-19 */
         movdqa          MSGTMP0, MSG
-                paddd           4*16(SHA256CONSTANTS), MSG
+                paddd           -4*16(SHA256CONSTANTS), MSG     /* = K256 + 4*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP0, MSGTMP4
         palignr         $4, MSGTMP3, MSGTMP4
         paddd           MSGTMP4, MSGTMP1
         sha256msg2      MSGTMP0, MSGTMP1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP0, MSGTMP3
 
         /* Rounds 20-23 */
         movdqa          MSGTMP1, MSG
-                paddd           5*16(SHA256CONSTANTS), MSG
+                paddd           -3*16(SHA256CONSTANTS), MSG     /* = K256 + 5*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP1, MSGTMP4
         palignr         $4, MSGTMP0, MSGTMP4
         paddd           MSGTMP4, MSGTMP2
         sha256msg2      MSGTMP1, MSGTMP2
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP1, MSGTMP0
 
         /* Rounds 24-27 */
         movdqa          MSGTMP2, MSG
-                paddd           6*16(SHA256CONSTANTS), MSG
+                paddd           -2*16(SHA256CONSTANTS), MSG     /* = K256 + 6*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP2, MSGTMP4
         palignr         $4, MSGTMP1, MSGTMP4
         paddd           MSGTMP4, MSGTMP3
         sha256msg2      MSGTMP2, MSGTMP3
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP2, MSGTMP1
 
         /* Rounds 28-31 */
         movdqa          MSGTMP3, MSG
-                paddd           7*16(SHA256CONSTANTS), MSG
+                paddd           -1*16(SHA256CONSTANTS), MSG     /* = K256 + 7*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP3, MSGTMP4
         palignr         $4, MSGTMP2, MSGTMP4
         paddd           MSGTMP4, MSGTMP0
         sha256msg2      MSGTMP3, MSGTMP0
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP3, MSGTMP2
 
         /* Rounds 32-35 */
         movdqa          MSGTMP0, MSG
-                paddd           8*16(SHA256CONSTANTS), MSG
+                paddd           0*16(SHA256CONSTANTS), MSG      /* = K256 + 8*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP0, MSGTMP4
         palignr         $4, MSGTMP3, MSGTMP4
         paddd           MSGTMP4, MSGTMP1
         sha256msg2      MSGTMP0, MSGTMP1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP0, MSGTMP3
 
         /* Rounds 36-39 */
         movdqa          MSGTMP1, MSG
-                paddd           9*16(SHA256CONSTANTS), MSG
+                paddd           1*16(SHA256CONSTANTS), MSG      /* = K256 + 9*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP1, MSGTMP4
         palignr         $4, MSGTMP0, MSGTMP4
         paddd           MSGTMP4, MSGTMP2
         sha256msg2      MSGTMP1, MSGTMP2
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP1, MSGTMP0
 
         /* Rounds 40-43 */
         movdqa          MSGTMP2, MSG
-                paddd           10*16(SHA256CONSTANTS), MSG
+                paddd           2*16(SHA256CONSTANTS), MSG      /* = K256 + 10*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP2, MSGTMP4
         palignr         $4, MSGTMP1, MSGTMP4
         paddd           MSGTMP4, MSGTMP3
         sha256msg2      MSGTMP2, MSGTMP3
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP2, MSGTMP1
 
         /* Rounds 44-47 */
         movdqa          MSGTMP3, MSG
-                paddd           11*16(SHA256CONSTANTS), MSG
+                paddd           3*16(SHA256CONSTANTS), MSG      /* = K256 + 11*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP3, MSGTMP4
         palignr         $4, MSGTMP2, MSGTMP4
         paddd           MSGTMP4, MSGTMP0
         sha256msg2      MSGTMP3, MSGTMP0
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP3, MSGTMP2
 
         /* Rounds 48-51 */
         movdqa          MSGTMP0, MSG
-                paddd           12*16(SHA256CONSTANTS), MSG
+                paddd           4*16(SHA256CONSTANTS), MSG      /* = K256 + 12*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP0, MSGTMP4
         palignr         $4, MSGTMP3, MSGTMP4
         paddd           MSGTMP4, MSGTMP1
         sha256msg2      MSGTMP0, MSGTMP1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
         sha256msg1      MSGTMP0, MSGTMP3
 
         /* Rounds 52-55 */
         movdqa          MSGTMP1, MSG
-                paddd           13*16(SHA256CONSTANTS), MSG
+                paddd           5*16(SHA256CONSTANTS), MSG      /* = K256 + 13*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP1, MSGTMP4
         palignr         $4, MSGTMP0, MSGTMP4
         paddd           MSGTMP4, MSGTMP2
         sha256msg2      MSGTMP1, MSGTMP2
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
 
         /* Rounds 56-59 */
         movdqa          MSGTMP2, MSG
-                paddd           14*16(SHA256CONSTANTS), MSG
+                paddd           6*16(SHA256CONSTANTS), MSG      /* = K256 + 14*16 */
                 sha256rnds2     STATE0, STATE1
         movdqa          MSGTMP2, MSGTMP4
         palignr         $4, MSGTMP1, MSGTMP4
         paddd           MSGTMP4, MSGTMP3
         sha256msg2      MSGTMP2, MSGTMP3
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
 
         /* Rounds 60-63 */
         movdqa          MSGTMP3, MSG
-                paddd           15*16(SHA256CONSTANTS), MSG
+                paddd           7*16(SHA256CONSTANTS), MSG      /* = K256 + 15*16 */
                 sha256rnds2     STATE0, STATE1
-                pshufd          $0x0E, MSG, MSG
+                punpckhqdq      MSG, MSG
                 sha256rnds2     STATE1, STATE0
 
         /* Add current hash values with previously saved */
@@ -315,11 +315,11 @@
         jne             .Lloop0
 
         /* Write hash values back in the correct order */
-        pshufd          $0x1B, STATE0,  STATE0          /* FEBA */
-        pshufd          $0xB1, STATE1,  STATE1          /* DCHG */
-        movdqa          STATE0, MSGTMP4
-        pblendw         $0xF0, STATE1,  STATE0          /* DCBA */
-        palignr         $8, MSGTMP4, STATE1             /* HGFE */
+        movdqa          STATE1, MSGTMP4                 /* CDGH (STATE0 is ABEF) */
+        punpcklqdq      STATE0, STATE1                  /* EFGH */
+        punpckhqdq      MSGTMP4, STATE0                 /* CDAB */
+        pshufd          $0xB1, STATE0,  STATE0          /* DCBA, stored at 0*16 below */
+        pshufd          $0x1B, STATE1,  STATE1          /* HGFE, stored at 1*16 below */
 
         movdqu          STATE0, 0*16(DIGEST_PTR)
         movdqu          STATE1, 1*16(DIGEST_PTR)


             reply	other threads:[~2024-04-08  9:45 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-08  9:26 Stefan Kanthak [this message]
2024-04-08 12:37 ` [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S Eric Biggers
     [not found]   ` <9088939CC5454139901CEDD97DAFB004@H270>
2024-04-08 15:18     ` Eric Biggers
2024-04-09 10:23       ` Stefan Kanthak
2024-04-09 12:32         ` Eric Biggers
2024-04-08 13:12 ` Eric Biggers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5EEE09A9021540A5AAD8BFEEE915512D@H270 \
    --to=stefan.kanthak@nexgo.de \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sean.m.gulley@intel.com \
    --cc=tim.c.chen@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).