diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst
index 6ca53071edb..2cb3c78f09a 100644
--- a/Doc/library/hashlib.rst
+++ b/Doc/library/hashlib.rst
@@ -69,7 +69,13 @@ Constructors for hash algorithms that are always present in this module are
 :func:`md5` is normally available as well, though it
 may be missing if you are using a rare "FIPS compliant" build of Python.
 Additional algorithms may also be available depending upon the OpenSSL
-library that Python uses on your platform.
+library that Python uses on your platform. On most platforms the
+:func:`sha3_224`, :func:`sha3_256`, :func:`sha3_384`, :func:`sha3_512`,
+:func:`shake_128`, and :func:`shake_256` constructors are also available.
+
+.. versionadded:: 3.6
+   SHA3 (Keccak) and SHAKE constructors :func:`sha3_224`, :func:`sha3_256`,
+   :func:`sha3_384`, :func:`sha3_512`, :func:`shake_128`, :func:`shake_256`.
 
 .. versionadded:: 3.6
    :func:`blake2b` and :func:`blake2s` were added.
@@ -189,6 +195,28 @@ A hash object has the following methods:
    compute the digests of data sharing a common initial substring.
 
 
+SHAKE variable length digests
+-----------------------------
+
+The :func:`shake_128` and :func:`shake_256` algorithms provide variable
+length digests with ``length_in_bits//2`` up to 128 or 256 bits of security.
+As such, their digest methods require a length. Maximum length is not limited
+by the SHAKE algorithm.
+
+.. method:: shake.digest(length)
+
+   Return the digest of the data passed to the :meth:`update` method so far.
+   This is a bytes object of size ``length`` which may contain bytes in
+   the whole range from 0 to 255.
+
+
+.. method:: shake.hexdigest(length)
+
+   Like :meth:`digest` except the digest is returned as a string object of
+   double length, containing only hexadecimal digits.  This may be used to
+   exchange the value safely in email or other non-binary environments.
+
+
 Key derivation
 --------------
 
diff --git a/Lib/hashlib.py b/Lib/hashlib.py
index 2d5e92ea338..053a7add459 100644
--- a/Lib/hashlib.py
+++ b/Lib/hashlib.py
@@ -11,7 +11,8 @@ new(name, data=b'', **kwargs) - returns a new hash object implementing the
 Named constructor functions are also available, these are faster
 than using new(name):
 
-md5(), sha1(), sha224(), sha256(), sha384(), sha512(), blake2b(), and blake2s()
+md5(), sha1(), sha224(), sha256(), sha384(), sha512(), blake2b(), blake2s(),
+sha3_224(), sha3_256(), sha3_384(), sha3_512(), shake_128(), and shake_256().
 
 More algorithms may be available on your platform but the above are guaranteed
 to exist.  See the algorithms_guaranteed and algorithms_available attributes
@@ -55,7 +56,10 @@ More condensed:
 # This tuple and __get_builtin_constructor() must be modified if a new
 # always available algorithm is added.
 __always_supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512',
-                      'blake2b', 'blake2s')
+                      'blake2b', 'blake2s',
+                      'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
+                      'shake_128', 'shake_256')
+
 
 algorithms_guaranteed = set(__always_supported)
 algorithms_available = set(__always_supported)
@@ -90,6 +94,15 @@ def __get_builtin_constructor(name):
             import _blake2
             cache['blake2b'] = _blake2.blake2b
             cache['blake2s'] = _blake2.blake2s
+        elif name in {'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
+                      'shake_128', 'shake_256'}:
+            import _sha3
+            cache['sha3_224'] = _sha3.sha3_224
+            cache['sha3_256'] = _sha3.sha3_256
+            cache['sha3_384'] = _sha3.sha3_384
+            cache['sha3_512'] = _sha3.sha3_512
+            cache['shake_128'] = _sha3.shake_128
+            cache['shake_256'] = _sha3.shake_256
     except ImportError:
         pass  # no extension module, this hash is unsupported.
 
diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py
index e94fc3f1e85..21260f6ea63 100644
--- a/Lib/test/test_hashlib.py
+++ b/Lib/test/test_hashlib.py
@@ -34,6 +34,13 @@ except ImportError:
 
 requires_blake2 = unittest.skipUnless(_blake2, 'requires _blake2')
 
+try:
+    import _sha3
+except ImportError:
+    _sha3 = None
+
+requires_sha3 = unittest.skipUnless(_sha3, 'requires _sha3')
+
 
 def hexstr(s):
     assert isinstance(s, bytes), repr(s)
@@ -61,7 +68,11 @@ class HashLibTestCase(unittest.TestCase):
     supported_hash_names = ( 'md5', 'MD5', 'sha1', 'SHA1',
                              'sha224', 'SHA224', 'sha256', 'SHA256',
                              'sha384', 'SHA384', 'sha512', 'SHA512',
-                             'blake2b', 'blake2s')
+                             'blake2b', 'blake2s',
+                             'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
+                             'shake_128', 'shake_256')
+
+    shakes = {'shake_128', 'shake_256'}
 
     # Issue #14693: fallback modules are always compiled under POSIX
     _warn_on_extension_import = os.name == 'posix' or COMPILED_WITH_PYDEBUG
@@ -131,6 +142,15 @@ class HashLibTestCase(unittest.TestCase):
             add_builtin_constructor('blake2s')
             add_builtin_constructor('blake2b')
 
+        _sha3 = self._conditional_import_module('_sha3')
+        if _sha3:
+            add_builtin_constructor('sha3_224')
+            add_builtin_constructor('sha3_256')
+            add_builtin_constructor('sha3_384')
+            add_builtin_constructor('sha3_512')
+            add_builtin_constructor('shake_128')
+            add_builtin_constructor('shake_256')
+
         super(HashLibTestCase, self).__init__(*args, **kwargs)
 
     @property
@@ -142,7 +162,10 @@ class HashLibTestCase(unittest.TestCase):
         a = array.array("b", range(10))
         for cons in self.hash_constructors:
             c = cons(a)
-            c.hexdigest()
+            if c.name in self.shakes:
+                c.hexdigest(16)
+            else:
+                c.hexdigest()
 
     def test_algorithms_guaranteed(self):
         self.assertEqual(hashlib.algorithms_guaranteed,
@@ -186,14 +209,21 @@ class HashLibTestCase(unittest.TestCase):
     def test_hexdigest(self):
         for cons in self.hash_constructors:
             h = cons()
-            self.assertIsInstance(h.digest(), bytes)
-            self.assertEqual(hexstr(h.digest()), h.hexdigest())
+            if h.name in self.shakes:
+                self.assertIsInstance(h.digest(16), bytes)
+                self.assertEqual(hexstr(h.digest(16)), h.hexdigest(16))
+            else:
+                self.assertIsInstance(h.digest(), bytes)
+                self.assertEqual(hexstr(h.digest()), h.hexdigest())
 
     def test_name_attribute(self):
         for cons in self.hash_constructors:
             h = cons()
             self.assertIsInstance(h.name, str)
-            self.assertIn(h.name, self.supported_hash_names)
+            if h.name in self.supported_hash_names:
+                self.assertIn(h.name, self.supported_hash_names)
+            else:
+                self.assertNotIn(h.name, self.supported_hash_names)
             self.assertEqual(h.name, hashlib.new(h.name).name)
 
     def test_large_update(self):
@@ -208,40 +238,46 @@ class HashLibTestCase(unittest.TestCase):
             m1.update(bees)
             m1.update(cees)
             m1.update(dees)
+            if m1.name in self.shakes:
+                args = (16,)
+            else:
+                args = ()
 
             m2 = cons()
             m2.update(aas + bees + cees + dees)
-            self.assertEqual(m1.digest(), m2.digest())
+            self.assertEqual(m1.digest(*args), m2.digest(*args))
 
             m3 = cons(aas + bees + cees + dees)
-            self.assertEqual(m1.digest(), m3.digest())
+            self.assertEqual(m1.digest(*args), m3.digest(*args))
 
             # verify copy() doesn't touch original
             m4 = cons(aas + bees + cees)
-            m4_digest = m4.digest()
+            m4_digest = m4.digest(*args)
             m4_copy = m4.copy()
             m4_copy.update(dees)
-            self.assertEqual(m1.digest(), m4_copy.digest())
-            self.assertEqual(m4.digest(), m4_digest)
+            self.assertEqual(m1.digest(*args), m4_copy.digest(*args))
+            self.assertEqual(m4.digest(*args), m4_digest)
 
-    def check(self, name, data, hexdigest, **kwargs):
+    def check(self, name, data, hexdigest, shake=False, **kwargs):
+        length = len(hexdigest)//2
         hexdigest = hexdigest.lower()
         constructors = self.constructors_to_test[name]
         # 2 is for hashlib.name(...) and hashlib.new(name, ...)
         self.assertGreaterEqual(len(constructors), 2)
         for hash_object_constructor in constructors:
             m = hash_object_constructor(data, **kwargs)
-            computed = m.hexdigest()
+            computed = m.hexdigest() if not shake else m.hexdigest(length)
             self.assertEqual(
                     computed, hexdigest,
                     "Hash algorithm %s constructed using %s returned hexdigest"
                     " %r for %d byte input data that should have hashed to %r."
                     % (name, hash_object_constructor,
                        computed, len(data), hexdigest))
-            computed = m.digest()
+            computed = m.digest() if not shake else m.digest(length)
             digest = bytes.fromhex(hexdigest)
             self.assertEqual(computed, digest)
-            self.assertEqual(len(digest), m.digest_size)
+            if not shake:
+                self.assertEqual(len(digest), m.digest_size)
 
     def check_no_unicode(self, algorithm_name):
         # Unicode objects are not allowed as input.
@@ -262,13 +298,30 @@ class HashLibTestCase(unittest.TestCase):
         self.check_no_unicode('blake2b')
         self.check_no_unicode('blake2s')
 
-    def check_blocksize_name(self, name, block_size=0, digest_size=0):
+    @requires_sha3
+    def test_no_unicode_sha3(self):
+        self.check_no_unicode('sha3_224')
+        self.check_no_unicode('sha3_256')
+        self.check_no_unicode('sha3_384')
+        self.check_no_unicode('sha3_512')
+        self.check_no_unicode('shake_128')
+        self.check_no_unicode('shake_256')
+
+    def check_blocksize_name(self, name, block_size=0, digest_size=0,
+                             digest_length=None):
         constructors = self.constructors_to_test[name]
         for hash_object_constructor in constructors:
             m = hash_object_constructor()
             self.assertEqual(m.block_size, block_size)
             self.assertEqual(m.digest_size, digest_size)
-            self.assertEqual(len(m.digest()), digest_size)
+            if digest_length:
+                self.assertEqual(len(m.digest(digest_length)),
+                                 digest_length)
+                self.assertEqual(len(m.hexdigest(digest_length)),
+                                 2*digest_length)
+            else:
+                self.assertEqual(len(m.digest()), digest_size)
+                self.assertEqual(len(m.hexdigest()), 2*digest_size)
             self.assertEqual(m.name, name)
             # split for sha3_512 / _sha3.sha3 object
             self.assertIn(name.split("_")[0], repr(m))
@@ -280,6 +333,12 @@ class HashLibTestCase(unittest.TestCase):
         self.check_blocksize_name('sha256', 64, 32)
         self.check_blocksize_name('sha384', 128, 48)
         self.check_blocksize_name('sha512', 128, 64)
+        self.check_blocksize_name('sha3_224', 144, 28)
+        self.check_blocksize_name('sha3_256', 136, 32)
+        self.check_blocksize_name('sha3_384', 104, 48)
+        self.check_blocksize_name('sha3_512', 72, 64)
+        self.check_blocksize_name('shake_128', 168, 0, 32)
+        self.check_blocksize_name('shake_256', 136, 0, 64)
 
     @requires_blake2
     def test_blocksize_name_blake2(self):
@@ -563,6 +622,72 @@ class HashLibTestCase(unittest.TestCase):
             key = bytes.fromhex(key)
             self.check('blake2s', msg, md, key=key)
 
+    @requires_sha3
+    def test_case_sha3_224_0(self):
+        self.check('sha3_224', b"",
+          "6b4e03423667dbb73b6e15454f0eb1abd4597f9a1b078e3f5b5a6bc7")
+
+    @requires_sha3
+    def test_case_sha3_224_vector(self):
+        for msg, md in read_vectors('sha3_224'):
+            self.check('sha3_224', msg, md)
+
+    @requires_sha3
+    def test_case_sha3_256_0(self):
+        self.check('sha3_256', b"",
+          "a7ffc6f8bf1ed76651c14756a061d662f580ff4de43b49fa82d80a4b80f8434a")
+
+    @requires_sha3
+    def test_case_sha3_256_vector(self):
+        for msg, md in read_vectors('sha3_256'):
+            self.check('sha3_256', msg, md)
+
+    @requires_sha3
+    def test_case_sha3_384_0(self):
+        self.check('sha3_384', b"",
+          "0c63a75b845e4f7d01107d852e4c2485c51a50aaaa94fc61995e71bbee983a2a"+
+          "c3713831264adb47fb6bd1e058d5f004")
+
+    @requires_sha3
+    def test_case_sha3_384_vector(self):
+        for msg, md in read_vectors('sha3_384'):
+            self.check('sha3_384', msg, md)
+
+    @requires_sha3
+    def test_case_sha3_512_0(self):
+        self.check('sha3_512', b"",
+          "a69f73cca23a9ac5c8b567dc185a756e97c982164fe25859e0d1dcc1475c80a6"+
+          "15b2123af1f5f94c11e3e9402c3ac558f500199d95b6d3e301758586281dcd26")
+
+    @requires_sha3
+    def test_case_sha3_512_vector(self):
+        for msg, md in read_vectors('sha3_512'):
+            self.check('sha3_512', msg, md)
+
+    @requires_sha3
+    def test_case_shake_128_0(self):
+        self.check('shake_128', b"",
+          "7f9c2ba4e88f827d616045507605853ed73b8093f6efbc88eb1a6eacfa66ef26",
+          True)
+        self.check('shake_128', b"", "7f9c", True)
+
+    @requires_sha3
+    def test_case_shake128_vector(self):
+        for msg, md in read_vectors('shake_128'):
+            self.check('shake_128', msg, md, True)
+
+    @requires_sha3
+    def test_case_shake_256_0(self):
+        self.check('shake_256', b"",
+          "46b9dd2b0ba88d13233b3feb743eeb243fcd52ea62b81b82b50c27646ed5762f",
+          True)
+        self.check('shake_256', b"", "46b9", True)
+
+    @requires_sha3
+    def test_case_shake256_vector(self):
+        for msg, md in read_vectors('shake_256'):
+            self.check('shake_256', msg, md, True)
+
     def test_gil(self):
         # Check things work fine with an input larger than the size required
         # for multithreaded operation (which is hardwired to 2048).
diff --git a/Misc/NEWS b/Misc/NEWS
index 1335af14522..b3673067038 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -91,6 +91,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #16113: Add SHA-3 and SHAKE support to hashlib module.
+
 - Issue #27776: The :func:`os.urandom` function does now block on Linux 3.17
   and newer until the system urandom entropy pool is initialized to increase
   the security. This change is part of the :pep:`524`.
diff --git a/Modules/_sha3/README.txt b/Modules/_sha3/README.txt
new file mode 100644
index 00000000000..e34b1d12f70
--- /dev/null
+++ b/Modules/_sha3/README.txt
@@ -0,0 +1,11 @@
+Keccak Code Package
+===================
+
+The files in kcp are taken from the Keccak Code Package. They have been
+slightly modified to be C89 compatible. The architecture specific header file
+KeccakP-1600-SnP.h has been renamed to KeccakP-1600-SnP-opt32.h or
+KeccakP-1600-SnP-opt64.h.
+
+The 64bit files were generated with generic64lc/libkeccak.a.pack target, the
+32bit files with generic32lc/libkeccak.a.pack.
+
diff --git a/Modules/_sha3/cleanup.py b/Modules/_sha3/cleanup.py
new file mode 100755
index 00000000000..17c56b34b26
--- /dev/null
+++ b/Modules/_sha3/cleanup.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright (C) 2012   Christian Heimes (christian@python.org)
+# Licensed to PSF under a Contributor Agreement.
+#
+# cleanup Keccak sources
+
+import os
+import re
+
+CPP1 = re.compile("^//(.*)")
+CPP2 = re.compile("\ //(.*)")
+
+STATICS = ("void ", "int ", "HashReturn ",
+           "const UINT64 ", "UINT16 ", "    int prefix##")
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+KECCAK = os.path.join(HERE, "kcp")
+
+def getfiles():
+    for name in os.listdir(KECCAK):
+        name = os.path.join(KECCAK, name)
+        if os.path.isfile(name):
+            yield name
+
+def cleanup(f):
+    buf = []
+    for line in f:
+        # mark all functions and global data as static
+        #if line.startswith(STATICS):
+        #    buf.append("static " + line)
+        #    continue
+        # remove UINT64 typedef, we have our own
+        if line.startswith("typedef unsigned long long int"):
+            buf.append("/* %s */\n" % line.strip())
+            continue
+        ## remove #include "brg_endian.h"
+        if "brg_endian.h" in line:
+            buf.append("/* %s */\n" % line.strip())
+            continue
+        # transform C++ comments into ANSI C comments
+        line = CPP1.sub(r"/*\1 */\n", line)
+        line = CPP2.sub(r" /*\1 */\n", line)
+        buf.append(line)
+    return "".join(buf)
+
+for name in getfiles():
+    with open(name) as f:
+        res = cleanup(f)
+    with open(name, "w") as f:
+        f.write(res)
diff --git a/Modules/_sha3/clinic/sha3module.c.h b/Modules/_sha3/clinic/sha3module.c.h
new file mode 100644
index 00000000000..bfd95cd6ef0
--- /dev/null
+++ b/Modules/_sha3/clinic/sha3module.c.h
@@ -0,0 +1,148 @@
+/*[clinic input]
+preserve
+[clinic start generated code]*/
+
+PyDoc_STRVAR(py_sha3_new__doc__,
+"sha3_224(string=None)\n"
+"--\n"
+"\n"
+"Return a new SHA3 hash object with a hashbit length of 28 bytes.");
+
+static PyObject *
+py_sha3_new_impl(PyTypeObject *type, PyObject *data);
+
+static PyObject *
+py_sha3_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static char *_keywords[] = {"string", NULL};
+    PyObject *data = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:sha3_224", _keywords,
+        &data))
+        goto exit;
+    return_value = py_sha3_new_impl(type, data);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_sha3_sha3_224_copy__doc__,
+"copy($self, /)\n"
+"--\n"
+"\n"
+"Return a copy of the hash object.");
+
+#define _SHA3_SHA3_224_COPY_METHODDEF    \
+    {"copy", (PyCFunction)_sha3_sha3_224_copy, METH_NOARGS, _sha3_sha3_224_copy__doc__},
+
+static PyObject *
+_sha3_sha3_224_copy_impl(SHA3object *self);
+
+static PyObject *
+_sha3_sha3_224_copy(SHA3object *self, PyObject *Py_UNUSED(ignored))
+{
+    return _sha3_sha3_224_copy_impl(self);
+}
+
+PyDoc_STRVAR(_sha3_sha3_224_digest__doc__,
+"digest($self, /)\n"
+"--\n"
+"\n"
+"Return the digest value as a string of binary data.");
+
+#define _SHA3_SHA3_224_DIGEST_METHODDEF    \
+    {"digest", (PyCFunction)_sha3_sha3_224_digest, METH_NOARGS, _sha3_sha3_224_digest__doc__},
+
+static PyObject *
+_sha3_sha3_224_digest_impl(SHA3object *self);
+
+static PyObject *
+_sha3_sha3_224_digest(SHA3object *self, PyObject *Py_UNUSED(ignored))
+{
+    return _sha3_sha3_224_digest_impl(self);
+}
+
+PyDoc_STRVAR(_sha3_sha3_224_hexdigest__doc__,
+"hexdigest($self, /)\n"
+"--\n"
+"\n"
+"Return the digest value as a string of hexadecimal digits.");
+
+#define _SHA3_SHA3_224_HEXDIGEST_METHODDEF    \
+    {"hexdigest", (PyCFunction)_sha3_sha3_224_hexdigest, METH_NOARGS, _sha3_sha3_224_hexdigest__doc__},
+
+static PyObject *
+_sha3_sha3_224_hexdigest_impl(SHA3object *self);
+
+static PyObject *
+_sha3_sha3_224_hexdigest(SHA3object *self, PyObject *Py_UNUSED(ignored))
+{
+    return _sha3_sha3_224_hexdigest_impl(self);
+}
+
+PyDoc_STRVAR(_sha3_sha3_224_update__doc__,
+"update($self, obj, /)\n"
+"--\n"
+"\n"
+"Update this hash object\'s state with the provided string.");
+
+#define _SHA3_SHA3_224_UPDATE_METHODDEF    \
+    {"update", (PyCFunction)_sha3_sha3_224_update, METH_O, _sha3_sha3_224_update__doc__},
+
+PyDoc_STRVAR(_sha3_shake_128_digest__doc__,
+"digest($self, /, length)\n"
+"--\n"
+"\n"
+"Return the digest value as a string of binary data.");
+
+#define _SHA3_SHAKE_128_DIGEST_METHODDEF    \
+    {"digest", (PyCFunction)_sha3_shake_128_digest, METH_VARARGS|METH_KEYWORDS, _sha3_shake_128_digest__doc__},
+
+static PyObject *
+_sha3_shake_128_digest_impl(SHA3object *self, unsigned long length);
+
+static PyObject *
+_sha3_shake_128_digest(SHA3object *self, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static char *_keywords[] = {"length", NULL};
+    unsigned long length;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "k:digest", _keywords,
+        &length))
+        goto exit;
+    return_value = _sha3_shake_128_digest_impl(self, length);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_sha3_shake_128_hexdigest__doc__,
+"hexdigest($self, /, length)\n"
+"--\n"
+"\n"
+"Return the digest value as a string of hexadecimal digits.");
+
+#define _SHA3_SHAKE_128_HEXDIGEST_METHODDEF    \
+    {"hexdigest", (PyCFunction)_sha3_shake_128_hexdigest, METH_VARARGS|METH_KEYWORDS, _sha3_shake_128_hexdigest__doc__},
+
+static PyObject *
+_sha3_shake_128_hexdigest_impl(SHA3object *self, unsigned long length);
+
+static PyObject *
+_sha3_shake_128_hexdigest(SHA3object *self, PyObject *args, PyObject *kwargs)
+{
+    PyObject *return_value = NULL;
+    static char *_keywords[] = {"length", NULL};
+    unsigned long length;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "k:hexdigest", _keywords,
+        &length))
+        goto exit;
+    return_value = _sha3_shake_128_hexdigest_impl(self, length);
+
+exit:
+    return return_value;
+}
+/*[clinic end generated code: output=2eb6db41778eeb50 input=a9049054013a1b77]*/
diff --git a/Modules/_sha3/kcp/KeccakHash.c b/Modules/_sha3/kcp/KeccakHash.c
new file mode 100644
index 00000000000..e09fb43cace
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakHash.c
@@ -0,0 +1,82 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakHash.h"
+
+/* ---------------------------------------------------------------- */
+
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *instance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix)
+{
+    HashReturn result;
+
+    if (delimitedSuffix == 0)
+        return FAIL;
+    result = (HashReturn)KeccakWidth1600_SpongeInitialize(&instance->sponge, rate, capacity);
+    if (result != SUCCESS)
+        return result;
+    instance->fixedOutputLength = hashbitlen;
+    instance->delimitedSuffix = delimitedSuffix;
+    return SUCCESS;
+}
+
+/* ---------------------------------------------------------------- */
+
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, DataLength databitlen)
+{
+    if ((databitlen % 8) == 0)
+        return (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+    else {
+        HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+        if (ret == SUCCESS) {
+            /* The last partial byte is assumed to be aligned on the least significant bits */
+
+            unsigned char lastByte = data[databitlen/8];
+            /* Concatenate the last few bits provided here with those of the suffix */
+
+            unsigned short delimitedLastBytes = (unsigned short)((unsigned short)lastByte | ((unsigned short)instance->delimitedSuffix << (databitlen % 8)));
+            if ((delimitedLastBytes & 0xFF00) == 0x0000) {
+                instance->delimitedSuffix = delimitedLastBytes & 0xFF;
+            }
+            else {
+                unsigned char oneByte[1];
+                oneByte[0] = delimitedLastBytes & 0xFF;
+                ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, oneByte, 1);
+                instance->delimitedSuffix = (delimitedLastBytes >> 8) & 0xFF;
+            }
+        }
+        return ret;
+    }
+}
+
+/* ---------------------------------------------------------------- */
+
+HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
+{
+    HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorbLastFewBits(&instance->sponge, instance->delimitedSuffix);
+    if (ret == SUCCESS)
+        return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, hashval, instance->fixedOutputLength/8);
+    else
+        return ret;
+}
+
+/* ---------------------------------------------------------------- */
+
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, DataLength databitlen)
+{
+    if ((databitlen % 8) != 0)
+        return FAIL;
+    return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, data, databitlen/8);
+}
diff --git a/Modules/_sha3/kcp/KeccakHash.h b/Modules/_sha3/kcp/KeccakHash.h
new file mode 100644
index 00000000000..bbd3dc64a22
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakHash.h
@@ -0,0 +1,114 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakHashInterface_h_
+#define _KeccakHashInterface_h_
+
+#ifndef KeccakP1600_excluded
+
+#include "KeccakSponge.h"
+#include <string.h>
+
+typedef unsigned char BitSequence;
+typedef size_t DataLength;
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+
+typedef struct {
+    KeccakWidth1600_SpongeInstance sponge;
+    unsigned int fixedOutputLength;
+    unsigned char delimitedSuffix;
+} Keccak_HashInstance;
+
+/**
+  * Function to initialize the Keccak[r, c] sponge function instance used in sequential hashing mode.
+  * @param  hashInstance    Pointer to the hash instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  hashbitlen  The desired number of output bits,
+  *                     or 0 for an arbitrarily-long output.
+  * @param  delimitedSuffix Bits that will be automatically appended to the end
+  *                         of the input message, as in domain separation.
+  *                         This is a byte containing from 0 to 7 bits
+  *                         formatted like the @a delimitedData parameter of
+  *                         the Keccak_SpongeAbsorbLastFewBits() function.
+  * @pre    One must have r+c=1600 and the rate a multiple of 8 bits in this implementation.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix);
+
+/** Macro to initialize a SHAKE128 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE128(hashInstance)        Keccak_HashInitialize(hashInstance, 1344,  256,   0, 0x1F)
+
+/** Macro to initialize a SHAKE256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512,   0, 0x1F)
+
+/** Macro to initialize a SHA3-224 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_224(hashInstance)        Keccak_HashInitialize(hashInstance, 1152,  448, 224, 0x06)
+
+/** Macro to initialize a SHA3-256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512, 256, 0x06)
+
+/** Macro to initialize a SHA3-384 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_384(hashInstance)        Keccak_HashInitialize(hashInstance,  832,  768, 384, 0x06)
+
+/** Macro to initialize a SHA3-512 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_512(hashInstance)        Keccak_HashInitialize(hashInstance,  576, 1024, 512, 0x06)
+
+/**
+  * Function to give input data to be absorbed.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the input data.
+  *                     When @a databitLen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte (little-endian convention).
+  * @param  databitLen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Keccak_HashUpdate(), databitlen was a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, DataLength databitlen);
+
+/**
+  * Function to call after all input blocks have been input and to get
+  * output bits if the length was specified when calling Keccak_HashInitialize().
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * If @a hashbitlen was not 0 in the call to Keccak_HashInitialize(), the number of
+  *     output bits is equal to @a hashbitlen.
+  * If @a hashbitlen was 0 in the call to Keccak_HashInitialize(), the output bits
+  *     must be extracted using the Keccak_HashSqueeze() function.
+  * @param  state       Pointer to the state of the sponge function initialized by Init().
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hashval);
+
+ /**
+  * Function to squeeze output data.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  databitlen  The number of output bits desired (must be a multiple of 8).
+  * @pre    Keccak_HashFinal() must have been already called.
+  * @pre    @a databitlen is a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, DataLength databitlen);
+
+#endif
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-64.macros b/Modules/_sha3/kcp/KeccakP-1600-64.macros
new file mode 100644
index 00000000000..1f11fe3e79f
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-64.macros
@@ -0,0 +1,2208 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define declareABCDE /* Declares every local used by the round macros: lane sets A, B and E (25 lanes each) plus C and D (5 lanes each). */ \
+    UINT64 Aba, Abe, Abi, Abo, Abu; /* A: lane set; suffix = row letter (b,g,k,m,s) + column letter (a,e,i,o,u), index = 5*row + column as in copyFromState */ \
+    UINT64 Aga, Age, Agi, Ago, Agu; \
+    UINT64 Aka, Ake, Aki, Ako, Aku; \
+    UINT64 Ama, Ame, Ami, Amo, Amu; \
+    UINT64 Asa, Ase, Asi, Aso, Asu; \
+    UINT64 Bba, Bbe, Bbi, Bbo, Bbu; /* B: scratch lanes holding the rotated A lanes inside a round */ \
+    UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
+    UINT64 Bka, Bke, Bki, Bko, Bku; \
+    UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
+    UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
+    UINT64 Ca, Ce, Ci, Co, Cu; /* C: one parity lane per column (see prepareTheta) */ \
+    UINT64 Da, De, Di, Do, Du; /* D: per-column value XORed into the A lanes at the start of a round */ \
+    UINT64 Eba, Ebe, Ebi, Ebo, Ebu; /* E: second lane set, written by a round that reads A */ \
+    UINT64 Ega, Ege, Egi, Ego, Egu; \
+    UINT64 Eka, Eke, Eki, Eko, Eku; \
+    UINT64 Ema, Eme, Emi, Emo, Emu; \
+    UINT64 Esa, Ese, Esi, Eso, Esu; \
+
+#define prepareTheta /* Column parities of A: each C lane becomes the XOR of the five A lanes sharing its column letter. */ \
+    Ca = Aba^Aga^Aka^Ama^Asa; \
+    Ce = Abe^Age^Ake^Ame^Ase; \
+    Ci = Abi^Agi^Aki^Ami^Asi; \
+    Co = Abo^Ago^Ako^Amo^Aso; \
+    Cu = Abu^Agu^Aku^Amu^Asu; \
+
+#ifdef UseBebigokimisa
+/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa'): reads lane set A, writes lane set E, and leaves the column parities of E in Ca..Cu */
+
+/* --- 64-bit lanes mapped to 64-bit words; Ca..Cu are read on entry (see prepareTheta); i indexes KeccakF1600RoundConstants; the A lanes are modified in place */
+
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns ordered a,e,i,o,u */ \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; /* row b of B: A lanes ba, ge, ki, mo, su get their column D XORed in, then are rotated into place (rho, pi) */ \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); /* chi in its lane-complemented form: a mix of |, & and ~ instead of the uniform (~x)&y */ \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* iota: the round constant goes into lane ba only */ \
+    Ca = E##ba; /* prepare-theta: start accumulating the column parities of E */ \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; /* row g of B, from A lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; /* row k of B, from A lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; /* row m of B, from A lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; /* row s of B, from A lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+    Cu ^= E##su; \
+\
+
+/* --- Code for round (lane complementing pattern 'bebigokimisa'): same lane updates as thetaRhoPiChiIotaPrepareTheta above, but Ca..Cu are not refreshed from E */
+
+/* --- 64-bit lanes mapped to 64-bit words; Ca..Cu are read on entry; i indexes KeccakF1600RoundConstants; the A lanes are modified in place */
+
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns ordered a,e,i,o,u */ \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; /* row b of B: A lanes ba, ge, ki, mo, su get their column D XORed in, then are rotated into place (rho, pi) */ \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); /* chi in its lane-complemented form: a mix of |, & and ~ instead of the uniform (~x)&y */ \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* iota: the round constant goes into lane ba only */ \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+\
+    A##bo ^= Do; /* row g of B, from A lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+\
+    A##be ^= De; /* row k of B, from A lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+\
+    A##bu ^= Du; /* row m of B, from A lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+\
+    A##bi ^= Di; /* row s of B, from A lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+\
+
+#else /* UseBebigokimisa */
+
+/* --- Code for round, with prepare-theta (no lane complementing): reads lane set A, writes lane set E, and leaves the column parities of E in Ca..Cu */
+
+/* --- 64-bit lanes mapped to 64-bit words; Ca..Cu are read on entry (see prepareTheta); i indexes KeccakF1600RoundConstants; the A lanes are modified in place */
+
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns ordered a,e,i,o,u */ \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; /* row b of B: A lanes ba, ge, ki, mo, su get their column D XORed in, then are rotated into place (rho, pi) */ \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); /* chi: E[x] = B[x] ^ ((~B[x+1]) & B[x+2]) within the row */ \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* iota: the round constant goes into lane ba only */ \
+    Ca = E##ba; /* prepare-theta: start accumulating the column parities of E */ \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; /* row g of B, from A lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; /* row k of B, from A lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; /* row m of B, from A lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; /* row s of B, from A lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+    Cu ^= E##su; \
+\
+
+/* --- Code for round (no lane complementing): same lane updates as thetaRhoPiChiIotaPrepareTheta above, but Ca..Cu are not refreshed from E */
+
+/* --- 64-bit lanes mapped to 64-bit words; Ca..Cu are read on entry; i indexes KeccakF1600RoundConstants; the A lanes are modified in place */
+
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); /* theta: D[x] = C[x-1] ^ ROL64(C[x+1], 1), columns ordered a,e,i,o,u */ \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; /* row b of B: A lanes ba, ge, ki, mo, su get their column D XORed in, then are rotated into place (rho, pi) */ \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); /* chi: E[x] = B[x] ^ ((~B[x+1]) & B[x+2]) within the row */ \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* iota: the round constant goes into lane ba only */ \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+\
+    A##bo ^= Do; /* row g of B, from A lanes bo, gu, ka, me, si */ \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+\
+    A##be ^= De; /* row k of B, from A lanes be, gi, ko, mu, sa */ \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+\
+    A##bu ^= Du; /* row m of B, from A lanes bu, ga, ke, mi, so */ \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+\
+    A##bi ^= Di; /* row s of B, from A lanes bi, go, ku, ma, se */ \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+\
+
+#endif /* UseBebigokimisa */
+
+
+#define copyFromState(X, state) /* Loads state[0..24] into the 25 lane locals X##ba..X##su (index = 5*row + column). */ \
+    X##ba = state[ 0]; /* row b: indices 0..4 */ \
+    X##be = state[ 1]; \
+    X##bi = state[ 2]; \
+    X##bo = state[ 3]; \
+    X##bu = state[ 4]; \
+    X##ga = state[ 5]; /* row g: indices 5..9 */ \
+    X##ge = state[ 6]; \
+    X##gi = state[ 7]; \
+    X##go = state[ 8]; \
+    X##gu = state[ 9]; \
+    X##ka = state[10]; /* row k: indices 10..14 */ \
+    X##ke = state[11]; \
+    X##ki = state[12]; \
+    X##ko = state[13]; \
+    X##ku = state[14]; \
+    X##ma = state[15]; /* row m: indices 15..19 */ \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; /* row s: indices 20..24 */ \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+#define copyToState(state, X) /* Stores the 25 lane locals X##ba..X##su into state[0..24] (index = 5*row + column). */ \
+    state[ 0] = X##ba; /* row b: indices 0..4 */ \
+    state[ 1] = X##be; \
+    state[ 2] = X##bi; \
+    state[ 3] = X##bo; \
+    state[ 4] = X##bu; \
+    state[ 5] = X##ga; /* row g: indices 5..9 */ \
+    state[ 6] = X##ge; \
+    state[ 7] = X##gi; \
+    state[ 8] = X##go; \
+    state[ 9] = X##gu; \
+    state[10] = X##ka; /* row k: indices 10..14 */ \
+    state[11] = X##ke; \
+    state[12] = X##ki; \
+    state[13] = X##ko; \
+    state[14] = X##ku; \
+    state[15] = X##ma; /* row m: indices 15..19 */ \
+    state[16] = X##me; \
+    state[17] = X##mi; \
+    state[18] = X##mo; \
+    state[19] = X##mu; \
+    state[20] = X##sa; /* row s: indices 20..24 */ \
+    state[21] = X##se; \
+    state[22] = X##si; \
+    state[23] = X##so; \
+    state[24] = X##su; \
+
+#define copyStateVariables(X, Y) /* Copies each of the 25 lane locals Y##ba..Y##su into the X local with the same suffix. */ \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
+#define copyFromStateAndAdd(X, state, input, laneCount) /* Loads state[0..24] into the X locals, XORing input[k] into lane k for every k < laneCount; the nested tests form a decision tree on laneCount. */ \
+    if (laneCount < 16) { /* fewer than 16 input lanes: lanes 15..24 (ma..su) are plain copies, see end of this branch */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba = state[ 0]; \
+                    } \
+                    else { \
+                        X##ba = state[ 0]^input[ 0]; \
+                    } \
+                    X##be = state[ 1]; \
+                    X##bi = state[ 2]; \
+                } \
+                else { \
+                    X##ba = state[ 0]^input[ 0]; \
+                    X##be = state[ 1]^input[ 1]; \
+                    if (laneCount < 3) { \
+                        X##bi = state[ 2]; \
+                    } \
+                    else { \
+                        X##bi = state[ 2]^input[ 2]; \
+                    } \
+                } \
+                X##bo = state[ 3]; \
+                X##bu = state[ 4]; \
+                X##ga = state[ 5]; \
+                X##ge = state[ 6]; \
+            } \
+            else { \
+                X##ba = state[ 0]^input[ 0]; \
+                X##be = state[ 1]^input[ 1]; \
+                X##bi = state[ 2]^input[ 2]; \
+                X##bo = state[ 3]^input[ 3]; \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu = state[ 4]; \
+                    } \
+                    else { \
+                        X##bu = state[ 4]^input[ 4]; \
+                    } \
+                    X##ga = state[ 5]; \
+                    X##ge = state[ 6]; \
+                } \
+                else { \
+                    X##bu = state[ 4]^input[ 4]; \
+                    X##ga = state[ 5]^input[ 5]; \
+                    if (laneCount < 7) { \
+                        X##ge = state[ 6]; \
+                    } \
+                    else { \
+                        X##ge = state[ 6]^input[ 6]; \
+                    } \
+                } \
+            } \
+            X##gi = state[ 7]; /* fewer than 8 input lanes: lanes 7..14 are plain copies */ \
+            X##go = state[ 8]; \
+            X##gu = state[ 9]; \
+            X##ka = state[10]; \
+            X##ke = state[11]; \
+            X##ki = state[12]; \
+            X##ko = state[13]; \
+            X##ku = state[14]; \
+        } \
+        else { \
+            X##ba = state[ 0]^input[ 0]; /* at least 8 input lanes: lanes 0..7 always absorb input */ \
+            X##be = state[ 1]^input[ 1]; \
+            X##bi = state[ 2]^input[ 2]; \
+            X##bo = state[ 3]^input[ 3]; \
+            X##bu = state[ 4]^input[ 4]; \
+            X##ga = state[ 5]^input[ 5]; \
+            X##ge = state[ 6]^input[ 6]; \
+            X##gi = state[ 7]^input[ 7]; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go = state[ 8]; \
+                    } \
+                    else { \
+                        X##go = state[ 8]^input[ 8]; \
+                    } \
+                    X##gu = state[ 9]; \
+                    X##ka = state[10]; \
+                } \
+                else { \
+                    X##go = state[ 8]^input[ 8]; \
+                    X##gu = state[ 9]^input[ 9]; \
+                    if (laneCount < 11) { \
+                        X##ka = state[10]; \
+                    } \
+                    else { \
+                        X##ka = state[10]^input[10]; \
+                    } \
+                } \
+                X##ke = state[11]; \
+                X##ki = state[12]; \
+                X##ko = state[13]; \
+                X##ku = state[14]; \
+            } \
+            else { \
+                X##go = state[ 8]^input[ 8]; \
+                X##gu = state[ 9]^input[ 9]; \
+                X##ka = state[10]^input[10]; \
+                X##ke = state[11]^input[11]; \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki = state[12]; \
+                    } \
+                    else { \
+                        X##ki = state[12]^input[12]; \
+                    } \
+                    X##ko = state[13]; \
+                    X##ku = state[14]; \
+                } \
+                else { \
+                    X##ki = state[12]^input[12]; \
+                    X##ko = state[13]^input[13]; \
+                    if (laneCount < 15) { \
+                        X##ku = state[14]; \
+                    } \
+                    else { \
+                        X##ku = state[14]^input[14]; \
+                    } \
+                } \
+            } \
+        } \
+        X##ma = state[15]; /* lanes 15..24 untouched by input in the laneCount < 16 branch */ \
+        X##me = state[16]; \
+        X##mi = state[17]; \
+        X##mo = state[18]; \
+        X##mu = state[19]; \
+        X##sa = state[20]; \
+        X##se = state[21]; \
+        X##si = state[22]; \
+        X##so = state[23]; \
+        X##su = state[24]; \
+    } \
+    else { \
+        X##ba = state[ 0]^input[ 0]; /* at least 16 input lanes: lanes 0..15 always absorb input */ \
+        X##be = state[ 1]^input[ 1]; \
+        X##bi = state[ 2]^input[ 2]; \
+        X##bo = state[ 3]^input[ 3]; \
+        X##bu = state[ 4]^input[ 4]; \
+        X##ga = state[ 5]^input[ 5]; \
+        X##ge = state[ 6]^input[ 6]; \
+        X##gi = state[ 7]^input[ 7]; \
+        X##go = state[ 8]^input[ 8]; \
+        X##gu = state[ 9]^input[ 9]; \
+        X##ka = state[10]^input[10]; \
+        X##ke = state[11]^input[11]; \
+        X##ki = state[12]^input[12]; \
+        X##ko = state[13]^input[13]; \
+        X##ku = state[14]^input[14]; \
+        X##ma = state[15]^input[15]; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me = state[16]; \
+                    } \
+                    else { \
+                        X##me = state[16]^input[16]; \
+                    } \
+                    X##mi = state[17]; \
+                    X##mo = state[18]; \
+                } \
+                else { \
+                    X##me = state[16]^input[16]; \
+                    X##mi = state[17]^input[17]; \
+                    if (laneCount < 19) { \
+                        X##mo = state[18]; \
+                    } \
+                    else { \
+                        X##mo = state[18]^input[18]; \
+                    } \
+                } \
+                X##mu = state[19]; \
+                X##sa = state[20]; \
+                X##se = state[21]; \
+                X##si = state[22]; \
+            } \
+            else { \
+                X##me = state[16]^input[16]; \
+                X##mi = state[17]^input[17]; \
+                X##mo = state[18]^input[18]; \
+                X##mu = state[19]^input[19]; \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa = state[20]; \
+                    } \
+                    else { \
+                        X##sa = state[20]^input[20]; \
+                    } \
+                    X##se = state[21]; \
+                    X##si = state[22]; \
+                } \
+                else { \
+                    X##sa = state[20]^input[20]; \
+                    X##se = state[21]^input[21]; \
+                    if (laneCount < 23) { \
+                        X##si = state[22]; \
+                    } \
+                    else { \
+                        X##si = state[22]^input[22]; \
+                    } \
+                } \
+            } \
+            X##so = state[23]; /* fewer than 24 input lanes: lanes 23 and 24 are plain copies */ \
+            X##su = state[24]; \
+        } \
+        else { \
+            X##me = state[16]^input[16]; \
+            X##mi = state[17]^input[17]; \
+            X##mo = state[18]^input[18]; \
+            X##mu = state[19]^input[19]; \
+            X##sa = state[20]^input[20]; \
+            X##se = state[21]^input[21]; \
+            X##si = state[22]^input[22]; \
+            X##so = state[23]^input[23]; \
+            if (laneCount < 25) { \
+                X##su = state[24]; \
+            } \
+            else { \
+                X##su = state[24]^input[24]; /* laneCount of 25 or more: every lane absorbs input */ \
+            } \
+        } \
+    }
+
+#define addInput(X, input, laneCount) /* XORs input[k] into lane local k of X for every k < laneCount; lanes at or beyond laneCount are left unchanged. */ \
+    if (laneCount == 21) { /* dedicated straight-line branch, tested first: lanes 0..20 (ba..sa) */ \
+        X##ba ^= input[ 0]; \
+        X##be ^= input[ 1]; \
+        X##bi ^= input[ 2]; \
+        X##bo ^= input[ 3]; \
+        X##bu ^= input[ 4]; \
+        X##ga ^= input[ 5]; \
+        X##ge ^= input[ 6]; \
+        X##gi ^= input[ 7]; \
+        X##go ^= input[ 8]; \
+        X##gu ^= input[ 9]; \
+        X##ka ^= input[10]; \
+        X##ke ^= input[11]; \
+        X##ki ^= input[12]; \
+        X##ko ^= input[13]; \
+        X##ku ^= input[14]; \
+        X##ma ^= input[15]; \
+        X##me ^= input[16]; \
+        X##mi ^= input[17]; \
+        X##mo ^= input[18]; \
+        X##mu ^= input[19]; \
+        X##sa ^= input[20]; \
+    } \
+    else if (laneCount < 16) { /* same decision tree on laneCount as copyFromStateAndAdd, but only the XORs remain */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { /* no input lanes: nothing to add */ \
+                    } \
+                    else { \
+                        X##ba ^= input[ 0]; \
+                    } \
+                } \
+                else { \
+                    X##ba ^= input[ 0]; \
+                    X##be ^= input[ 1]; \
+                    if (laneCount < 3) { \
+                    } \
+                    else { \
+                        X##bi ^= input[ 2]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##ba ^= input[ 0]; \
+                X##be ^= input[ 1]; \
+                X##bi ^= input[ 2]; \
+                X##bo ^= input[ 3]; \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                    } \
+                    else { \
+                        X##bu ^= input[ 4]; \
+                    } \
+                } \
+                else { \
+                    X##bu ^= input[ 4]; \
+                    X##ga ^= input[ 5]; \
+                    if (laneCount < 7) { \
+                    } \
+                    else { \
+                        X##ge ^= input[ 6]; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            X##ba ^= input[ 0]; /* at least 8 input lanes: lanes 0..7 always absorb input */ \
+            X##be ^= input[ 1]; \
+            X##bi ^= input[ 2]; \
+            X##bo ^= input[ 3]; \
+            X##bu ^= input[ 4]; \
+            X##ga ^= input[ 5]; \
+            X##ge ^= input[ 6]; \
+            X##gi ^= input[ 7]; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                    } \
+                    else { \
+                        X##go ^= input[ 8]; \
+                    } \
+                } \
+                else { \
+                    X##go ^= input[ 8]; \
+                    X##gu ^= input[ 9]; \
+                    if (laneCount < 11) { \
+                    } \
+                    else { \
+                        X##ka ^= input[10]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##go ^= input[ 8]; \
+                X##gu ^= input[ 9]; \
+                X##ka ^= input[10]; \
+                X##ke ^= input[11]; \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                    } \
+                    else { \
+                        X##ki ^= input[12]; \
+                    } \
+                } \
+                else { \
+                    X##ki ^= input[12]; \
+                    X##ko ^= input[13]; \
+                    if (laneCount < 15) { \
+                    } \
+                    else { \
+                        X##ku ^= input[14]; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { \
+        X##ba ^= input[ 0]; /* at least 16 input lanes: lanes 0..15 always absorb input */ \
+        X##be ^= input[ 1]; \
+        X##bi ^= input[ 2]; \
+        X##bo ^= input[ 3]; \
+        X##bu ^= input[ 4]; \
+        X##ga ^= input[ 5]; \
+        X##ge ^= input[ 6]; \
+        X##gi ^= input[ 7]; \
+        X##go ^= input[ 8]; \
+        X##gu ^= input[ 9]; \
+        X##ka ^= input[10]; \
+        X##ke ^= input[11]; \
+        X##ki ^= input[12]; \
+        X##ko ^= input[13]; \
+        X##ku ^= input[14]; \
+        X##ma ^= input[15]; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                    } \
+                    else { \
+                        X##me ^= input[16]; \
+                    } \
+                } \
+                else { \
+                    X##me ^= input[16]; \
+                    X##mi ^= input[17]; \
+                    if (laneCount < 19) { \
+                    } \
+                    else { \
+                        X##mo ^= input[18]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##me ^= input[16]; \
+                X##mi ^= input[17]; \
+                X##mo ^= input[18]; \
+                X##mu ^= input[19]; \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                    } \
+                    else { \
+                        X##sa ^= input[20]; \
+                    } \
+                } \
+                else { \
+                    X##sa ^= input[20]; \
+                    X##se ^= input[21]; \
+                    if (laneCount < 23) { \
+                    } \
+                    else { \
+                        X##si ^= input[22]; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            X##me ^= input[16]; \
+            X##mi ^= input[17]; \
+            X##mo ^= input[18]; \
+            X##mu ^= input[19]; \
+            X##sa ^= input[20]; \
+            X##se ^= input[21]; \
+            X##si ^= input[22]; \
+            X##so ^= input[23]; \
+            if (laneCount < 25) { \
+            } \
+            else { \
+                X##su ^= input[24]; /* laneCount of 25 or more: every lane absorbs input */ \
+            } \
+        } \
+    }
+
+#ifdef UseBebigokimisa
+
+#define copyToStateAndOutput(X, state, output, laneCount) \
+    if (laneCount < 16) { \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    state[ 0] = X##ba; \
+                    if (laneCount >= 1) { \
+                        output[ 0] = X##ba; \
+                    } \
+                    state[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                } \
+                else { \
+                    state[ 0] = X##ba; \
+                    output[ 0] = X##ba; \
+                    state[ 1] = X##be; \
+                    output[ 1] = ~X##be; \
+                    state[ 2] = X##bi; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = ~X##bi; \
+                    } \
+                } \
+                state[ 3] = X##bo; \
+                state[ 4] = X##bu; \
+                state[ 5] = X##ga; \
+                state[ 6] = X##ge; \
+            } \
+            else { \
+                state[ 0] = X##ba; \
+                output[ 0] = X##ba; \
+                state[ 1] = X##be; \
+                output[ 1] = ~X##be; \
+                state[ 2] = X##bi; \
+                output[ 2] = ~X##bi; \
+                state[ 3] = X##bo; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    state[ 4] = X##bu; \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                    state[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                } \
+                else { \
+                    state[ 4] = X##bu; \
+                    output[ 4] = X##bu; \
+                    state[ 5] = X##ga; \
+                    output[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+            state[ 7] = X##gi; \
+            state[ 8] = X##go; \
+            state[ 9] = X##gu; \
+            state[10] = X##ka; \
+            state[11] = X##ke; \
+            state[12] = X##ki; \
+            state[13] = X##ko; \
+            state[14] = X##ku; \
+        } \
+        else { \
+            state[ 0] = X##ba; \
+            output[ 0] = X##ba; \
+            state[ 1] = X##be; \
+            output[ 1] = ~X##be; \
+            state[ 2] = X##bi; \
+            output[ 2] = ~X##bi; \
+            state[ 3] = X##bo; \
+            output[ 3] = X##bo; \
+            state[ 4] = X##bu; \
+            output[ 4] = X##bu; \
+            state[ 5] = X##ga; \
+            output[ 5] = X##ga; \
+            state[ 6] = X##ge; \
+            output[ 6] = X##ge; \
+            state[ 7] = X##gi; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    state[ 8] = X##go; \
+                    if (laneCount >= 9) { \
+                        output[ 8] = ~X##go; \
+                    } \
+                    state[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                } \
+                else { \
+                    state[ 8] = X##go; \
+                    output[ 8] = ~X##go; \
+                    state[ 9] = X##gu; \
+                    output[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+                state[11] = X##ke; \
+                state[12] = X##ki; \
+                state[13] = X##ko; \
+                state[14] = X##ku; \
+            } \
+            else { \
+                state[ 8] = X##go; \
+                output[ 8] = ~X##go; \
+                state[ 9] = X##gu; \
+                output[ 9] = X##gu; \
+                state[10] = X##ka; \
+                output[10] = X##ka; \
+                state[11] = X##ke; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    state[12] = X##ki; \
+                    if (laneCount >= 13) { \
+                        output[12] = ~X##ki; \
+                    } \
+                    state[13] = X##ko; \
+                    state[14] = X##ku; \
+                } \
+                else { \
+                    state[12] = X##ki; \
+                    output[12] = ~X##ki; \
+                    state[13] = X##ko; \
+                    output[13] = X##ko; \
+                    state[14] = X##ku; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+        state[15] = X##ma; \
+        state[16] = X##me; \
+        state[17] = X##mi; \
+        state[18] = X##mo; \
+        state[19] = X##mu; \
+        state[20] = X##sa; \
+        state[21] = X##se; \
+        state[22] = X##si; \
+        state[23] = X##so; \
+        state[24] = X##su; \
+    } \
+    else { \
+        state[ 0] = X##ba; \
+        output[ 0] = X##ba; \
+        state[ 1] = X##be; \
+        output[ 1] = ~X##be; \
+        state[ 2] = X##bi; \
+        output[ 2] = ~X##bi; \
+        state[ 3] = X##bo; \
+        output[ 3] = X##bo; \
+        state[ 4] = X##bu; \
+        output[ 4] = X##bu; \
+        state[ 5] = X##ga; \
+        output[ 5] = X##ga; \
+        state[ 6] = X##ge; \
+        output[ 6] = X##ge; \
+        state[ 7] = X##gi; \
+        output[ 7] = X##gi; \
+        state[ 8] = X##go; \
+        output[ 8] = ~X##go; \
+        state[ 9] = X##gu; \
+        output[ 9] = X##gu; \
+        state[10] = X##ka; \
+        output[10] = X##ka; \
+        state[11] = X##ke; \
+        output[11] = X##ke; \
+        state[12] = X##ki; \
+        output[12] = ~X##ki; \
+        state[13] = X##ko; \
+        output[13] = X##ko; \
+        state[14] = X##ku; \
+        output[14] = X##ku; \
+        state[15] = X##ma; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    state[16] = X##me; \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                    state[17] = X##mi; \
+                    state[18] = X##mo; \
+                } \
+                else { \
+                    state[16] = X##me; \
+                    output[16] = X##me; \
+                    state[17] = X##mi; \
+                    output[17] = ~X##mi; \
+                    state[18] = X##mo; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+                state[19] = X##mu; \
+                state[20] = X##sa; \
+                state[21] = X##se; \
+                state[22] = X##si; \
+            } \
+            else { \
+                state[16] = X##me; \
+                output[16] = X##me; \
+                state[17] = X##mi; \
+                output[17] = ~X##mi; \
+                state[18] = X##mo; \
+                output[18] = X##mo; \
+                state[19] = X##mu; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    state[20] = X##sa; \
+                    if (laneCount >= 21) { \
+                        output[20] = ~X##sa; \
+                    } \
+                    state[21] = X##se; \
+                    state[22] = X##si; \
+                } \
+                else { \
+                    state[20] = X##sa; \
+                    output[20] = ~X##sa; \
+                    state[21] = X##se; \
+                    output[21] = X##se; \
+                    state[22] = X##si; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+            state[23] = X##so; \
+            state[24] = X##su; \
+        } \
+        else { \
+            state[16] = X##me; \
+            output[16] = X##me; \
+            state[17] = X##mi; \
+            output[17] = ~X##mi; \
+            state[18] = X##mo; \
+            output[18] = X##mo; \
+            state[19] = X##mu; \
+            output[19] = X##mu; \
+            state[20] = X##sa; \
+            output[20] = ~X##sa; \
+            state[21] = X##se; \
+            output[21] = X##se; \
+            state[22] = X##si; \
+            output[22] = X##si; \
+            state[23] = X##so; \
+            output[23] = X##so; \
+            state[24] = X##su; \
+            if (laneCount >= 25) { \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define output(X, output, laneCount) /* Copy the first laneCount lanes (25 at most) of X##ba..X##su into output[]; lanes be, bi, go, ki, mi and sa are written bitwise-complemented. */ \
+    if (laneCount < 16) { /* nested comparisons form a binary search on laneCount */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount >= 1) { /* laneCount < 1 writes nothing */ \
+                        output[ 0] = X##ba; \
+                    } \
+                } \
+                else { \
+                    output[ 0] = X##ba; \
+                    output[ 1] = ~X##be; /* complemented lane */ \
+                    if (laneCount >= 3) { \
+                        output[ 2] = ~X##bi; /* complemented lane */ \
+                    } \
+                } \
+            } \
+            else { /* 4 <= laneCount < 8 */ \
+                output[ 0] = X##ba; \
+                output[ 1] = ~X##be; \
+                output[ 2] = ~X##bi; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                } \
+                else { \
+                    output[ 4] = X##bu; \
+                    output[ 5] = X##ga; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always written */ \
+            output[ 0] = X##ba; \
+            output[ 1] = ~X##be; \
+            output[ 2] = ~X##bi; \
+            output[ 3] = X##bo; \
+            output[ 4] = X##bu; \
+            output[ 5] = X##ga; \
+            output[ 6] = X##ge; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount >= 9) { \
+                        output[ 8] = ~X##go; /* complemented lane */ \
+                    } \
+                } \
+                else { \
+                    output[ 8] = ~X##go; \
+                    output[ 9] = X##gu; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+            } \
+            else { /* 12 <= laneCount < 16 */ \
+                output[ 8] = ~X##go; \
+                output[ 9] = X##gu; \
+                output[10] = X##ka; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    if (laneCount >= 13) { \
+                        output[12] = ~X##ki; /* complemented lane */ \
+                    } \
+                } \
+                else { \
+                    output[12] = ~X##ki; \
+                    output[13] = X##ko; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are written unconditionally */ \
+        output[ 0] = X##ba; \
+        output[ 1] = ~X##be; \
+        output[ 2] = ~X##bi; \
+        output[ 3] = X##bo; \
+        output[ 4] = X##bu; \
+        output[ 5] = X##ga; \
+        output[ 6] = X##ge; \
+        output[ 7] = X##gi; \
+        output[ 8] = ~X##go; \
+        output[ 9] = X##gu; \
+        output[10] = X##ka; \
+        output[11] = X##ke; \
+        output[12] = ~X##ki; \
+        output[13] = X##ko; \
+        output[14] = X##ku; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                } \
+                else { \
+                    output[16] = X##me; \
+                    output[17] = ~X##mi; /* complemented lane */ \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+            } \
+            else { /* 20 <= laneCount < 24 */ \
+                output[16] = X##me; \
+                output[17] = ~X##mi; \
+                output[18] = X##mo; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    if (laneCount >= 21) { \
+                        output[20] = ~X##sa; /* complemented lane */ \
+                    } \
+                } \
+                else { \
+                    output[20] = ~X##sa; \
+                    output[21] = X##se; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* laneCount >= 24 */ \
+            output[16] = X##me; \
+            output[17] = ~X##mi; \
+            output[18] = X##mo; \
+            output[19] = X##mu; \
+            output[20] = ~X##sa; \
+            output[21] = X##se; \
+            output[22] = X##si; \
+            output[23] = X##so; \
+            if (laneCount >= 25) { /* the 25th lane only when every lane is requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define wrapOne(X, src, dst, lane, suffix) /* XOR src[lane] into the state word X##suffix, then emit the updated word to dst[lane] */ \
+    X##suffix = X##suffix ^ src[lane]; \
+    dst[lane] = X##suffix;
+
+#define wrapOneInvert(X, src, dst, lane, suffix) /* XOR src[lane] into the state word X##suffix, then emit the bitwise complement of the updated word */ \
+    X##suffix = X##suffix ^ src[lane]; \
+    dst[lane] = ~X##suffix;
+
+#define unwrapOne(X, src, dst, lane, suffix) /* dst[lane] = src[lane] XOR state word; the state word then absorbs dst[lane] (ending up equal to src[lane]) */ \
+    dst[lane] = X##suffix ^ src[lane]; \
+    X##suffix = X##suffix ^ dst[lane];
+
+#define unwrapOneInvert(X, input, output, index, name) /* unwrap for a lane that is emitted complemented */ \
+    output[index] = ~(input[index] ^ X##name); /* output = NOT(input XOR state word) */ \
+    X##name ^= output[index]; /* state word becomes ~input; no trailing line-continuation, so the definition ends here */
+
+#else /* UseBebigokimisa */
+
+
+#define copyToStateAndOutput(X, state, output, laneCount) /* Store all 25 lanes X##ba..X##su into state[0..24] and also copy the first laneCount lanes (25 at most) into output[]; both copies are uncomplemented here. */ \
+    if (laneCount < 16) { /* nested comparisons form a binary search on laneCount */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    state[ 0] = X##ba; \
+                    if (laneCount >= 1) { /* output is written only for indices below laneCount */ \
+                        output[ 0] = X##ba; \
+                    } \
+                    state[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                } \
+                else { \
+                    state[ 0] = X##ba; \
+                    output[ 0] = X##ba; \
+                    state[ 1] = X##be; \
+                    output[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = X##bi; \
+                    } \
+                } \
+                state[ 3] = X##bo; /* remaining lanes of this range go to state only */ \
+                state[ 4] = X##bu; \
+                state[ 5] = X##ga; \
+                state[ 6] = X##ge; \
+            } \
+            else { /* 4 <= laneCount < 8 */ \
+                state[ 0] = X##ba; \
+                output[ 0] = X##ba; \
+                state[ 1] = X##be; \
+                output[ 1] = X##be; \
+                state[ 2] = X##bi; \
+                output[ 2] = X##bi; \
+                state[ 3] = X##bo; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    state[ 4] = X##bu; \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                    state[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                } \
+                else { \
+                    state[ 4] = X##bu; \
+                    output[ 4] = X##bu; \
+                    state[ 5] = X##ga; \
+                    output[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+            state[ 7] = X##gi; /* laneCount < 8: lanes 7..14 go to state only */ \
+            state[ 8] = X##go; \
+            state[ 9] = X##gu; \
+            state[10] = X##ka; \
+            state[11] = X##ke; \
+            state[12] = X##ki; \
+            state[13] = X##ko; \
+            state[14] = X##ku; \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 go to both state and output */ \
+            state[ 0] = X##ba; \
+            output[ 0] = X##ba; \
+            state[ 1] = X##be; \
+            output[ 1] = X##be; \
+            state[ 2] = X##bi; \
+            output[ 2] = X##bi; \
+            state[ 3] = X##bo; \
+            output[ 3] = X##bo; \
+            state[ 4] = X##bu; \
+            output[ 4] = X##bu; \
+            state[ 5] = X##ga; \
+            output[ 5] = X##ga; \
+            state[ 6] = X##ge; \
+            output[ 6] = X##ge; \
+            state[ 7] = X##gi; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    state[ 8] = X##go; \
+                    if (laneCount >= 9) { \
+                        output[ 8] = X##go; \
+                    } \
+                    state[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                } \
+                else { \
+                    state[ 8] = X##go; \
+                    output[ 8] = X##go; \
+                    state[ 9] = X##gu; \
+                    output[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+                state[11] = X##ke; \
+                state[12] = X##ki; \
+                state[13] = X##ko; \
+                state[14] = X##ku; \
+            } \
+            else { /* 12 <= laneCount < 16 */ \
+                state[ 8] = X##go; \
+                output[ 8] = X##go; \
+                state[ 9] = X##gu; \
+                output[ 9] = X##gu; \
+                state[10] = X##ka; \
+                output[10] = X##ka; \
+                state[11] = X##ke; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    state[12] = X##ki; \
+                    if (laneCount >= 13) { \
+                        output[12]= X##ki; \
+                    } \
+                    state[13] = X##ko; \
+                    state[14] = X##ku; \
+                } \
+                else { \
+                    state[12] = X##ki; \
+                    output[12]= X##ki; \
+                    state[13] = X##ko; \
+                    output[13] = X##ko; \
+                    state[14] = X##ku; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+        state[15] = X##ma; /* laneCount < 16: lanes 15..24 go to state only */ \
+        state[16] = X##me; \
+        state[17] = X##mi; \
+        state[18] = X##mo; \
+        state[19] = X##mu; \
+        state[20] = X##sa; \
+        state[21] = X##se; \
+        state[22] = X##si; \
+        state[23] = X##so; \
+        state[24] = X##su; \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 go to both state and output */ \
+        state[ 0] = X##ba; \
+        output[ 0] = X##ba; \
+        state[ 1] = X##be; \
+        output[ 1] = X##be; \
+        state[ 2] = X##bi; \
+        output[ 2] = X##bi; \
+        state[ 3] = X##bo; \
+        output[ 3] = X##bo; \
+        state[ 4] = X##bu; \
+        output[ 4] = X##bu; \
+        state[ 5] = X##ga; \
+        output[ 5] = X##ga; \
+        state[ 6] = X##ge; \
+        output[ 6] = X##ge; \
+        state[ 7] = X##gi; \
+        output[ 7] = X##gi; \
+        state[ 8] = X##go; \
+        output[ 8] = X##go; \
+        state[ 9] = X##gu; \
+        output[ 9] = X##gu; \
+        state[10] = X##ka; \
+        output[10] = X##ka; \
+        state[11] = X##ke; \
+        output[11] = X##ke; \
+        state[12] = X##ki; \
+        output[12]= X##ki; \
+        state[13] = X##ko; \
+        output[13] = X##ko; \
+        state[14] = X##ku; \
+        output[14] = X##ku; \
+        state[15] = X##ma; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    state[16] = X##me; \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                    state[17] = X##mi; \
+                    state[18] = X##mo; \
+                } \
+                else { \
+                    state[16] = X##me; \
+                    output[16] = X##me; \
+                    state[17] = X##mi; \
+                    output[17] = X##mi; \
+                    state[18] = X##mo; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+                state[19] = X##mu; \
+                state[20] = X##sa; \
+                state[21] = X##se; \
+                state[22] = X##si; \
+            } \
+            else { /* 20 <= laneCount < 24 */ \
+                state[16] = X##me; \
+                output[16] = X##me; \
+                state[17] = X##mi; \
+                output[17] = X##mi; \
+                state[18] = X##mo; \
+                output[18] = X##mo; \
+                state[19] = X##mu; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    state[20] = X##sa; \
+                    if (laneCount >= 21) { \
+                        output[20] = X##sa; \
+                    } \
+                    state[21] = X##se; \
+                    state[22] = X##si; \
+                } \
+                else { \
+                    state[20] = X##sa; \
+                    output[20] = X##sa; \
+                    state[21] = X##se; \
+                    output[21] = X##se; \
+                    state[22] = X##si; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+            state[23] = X##so; /* laneCount < 24: the last two lanes go to state only */ \
+            state[24] = X##su; \
+        } \
+        else { /* laneCount >= 24 */ \
+            state[16] = X##me; \
+            output[16] = X##me; \
+            state[17] = X##mi; \
+            output[17] = X##mi; \
+            state[18] = X##mo; \
+            output[18] = X##mo; \
+            state[19] = X##mu; \
+            output[19] = X##mu; \
+            state[20] = X##sa; \
+            output[20] = X##sa; \
+            state[21] = X##se; \
+            output[21] = X##se; \
+            state[22] = X##si; \
+            output[22] = X##si; \
+            state[23] = X##so; \
+            output[23] = X##so; \
+            state[24] = X##su; \
+            if (laneCount >= 25) { /* the 25th lane is output only when every lane is requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define output(X, output, laneCount) /* Copy the first laneCount lanes (25 at most) of X##ba..X##su into output[] unchanged; no lane is complemented in this definition. */ \
+    if (laneCount < 16) { /* nested comparisons form a binary search on laneCount */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount >= 1) { /* laneCount < 1 writes nothing */ \
+                        output[ 0] = X##ba; \
+                    } \
+                } \
+                else { \
+                    output[ 0] = X##ba; \
+                    output[ 1] = X##be; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = X##bi; \
+                    } \
+                } \
+            } \
+            else { /* 4 <= laneCount < 8 */ \
+                output[ 0] = X##ba; \
+                output[ 1] = X##be; \
+                output[ 2] = X##bi; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                } \
+                else { \
+                    output[ 4] = X##bu; \
+                    output[ 5] = X##ga; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always written */ \
+            output[ 0] = X##ba; \
+            output[ 1] = X##be; \
+            output[ 2] = X##bi; \
+            output[ 3] = X##bo; \
+            output[ 4] = X##bu; \
+            output[ 5] = X##ga; \
+            output[ 6] = X##ge; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount >= 9) { \
+                        output[ 8] = X##go; \
+                    } \
+                } \
+                else { \
+                    output[ 8] = X##go; \
+                    output[ 9] = X##gu; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+            } \
+            else { /* 12 <= laneCount < 16 */ \
+                output[ 8] = X##go; \
+                output[ 9] = X##gu; \
+                output[10] = X##ka; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    if (laneCount >= 13) { \
+                        output[12] = X##ki; \
+                    } \
+                } \
+                else { \
+                    output[12] = X##ki; \
+                    output[13] = X##ko; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are written unconditionally */ \
+        output[ 0] = X##ba; \
+        output[ 1] = X##be; \
+        output[ 2] = X##bi; \
+        output[ 3] = X##bo; \
+        output[ 4] = X##bu; \
+        output[ 5] = X##ga; \
+        output[ 6] = X##ge; \
+        output[ 7] = X##gi; \
+        output[ 8] = X##go; \
+        output[ 9] = X##gu; \
+        output[10] = X##ka; \
+        output[11] = X##ke; \
+        output[12] = X##ki; \
+        output[13] = X##ko; \
+        output[14] = X##ku; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                } \
+                else { \
+                    output[16] = X##me; \
+                    output[17] = X##mi; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+            } \
+            else { /* 20 <= laneCount < 24 */ \
+                output[16] = X##me; \
+                output[17] = X##mi; \
+                output[18] = X##mo; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    if (laneCount >= 21) { \
+                        output[20] = X##sa; \
+                    } \
+                } \
+                else { \
+                    output[20] = X##sa; \
+                    output[21] = X##se; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* laneCount >= 24 */ \
+            output[16] = X##me; \
+            output[17] = X##mi; \
+            output[18] = X##mo; \
+            output[19] = X##mu; \
+            output[20] = X##sa; \
+            output[21] = X##se; \
+            output[22] = X##si; \
+            output[23] = X##so; \
+            if (laneCount >= 25) { /* the 25th lane only when every lane is requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define wrapOne(X, src, dst, lane, suffix) /* XOR src[lane] into the state word X##suffix, then emit the updated word to dst[lane] */ \
+    X##suffix = X##suffix ^ src[lane]; \
+    dst[lane] = X##suffix;
+
+#define wrapOneInvert(X, src, dst, lane, suffix) /* in this definition nothing is complemented: XOR src[lane] into X##suffix and emit the updated word as-is */ \
+    X##suffix = X##suffix ^ src[lane]; \
+    dst[lane] = X##suffix;
+
+#define unwrapOne(X, src, dst, lane, suffix) /* dst[lane] = src[lane] XOR state word; the state word then absorbs dst[lane] (ending up equal to src[lane]) */ \
+    dst[lane] = X##suffix ^ src[lane]; \
+    X##suffix = X##suffix ^ dst[lane];
+
+#define unwrapOneInvert(X, src, dst, lane, suffix) /* in this definition nothing is complemented: dst[lane] = src[lane] XOR state word, then the state word absorbs dst[lane] */ \
+    dst[lane] = X##suffix ^ src[lane]; \
+    X##suffix = X##suffix ^ dst[lane];
+
+#endif
+
+#define wrap(X, input, output, laneCount, trailingBits) /* For each i < laneCount (25 at most): XOR input[i] into lane i and write the result to output[i]; then, when laneCount < 25, XOR trailingBits into lane number laneCount. Lanes be, bi, go, ki, mi, sa go through wrapOneInvert, all others through wrapOne. */ \
+    if (laneCount < 16) { /* nested comparisons form a binary search on laneCount */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba ^= trailingBits; /* no lane wrapped: trailingBits lands in the first lane */ \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 0, ba) \
+                        X##be ^= trailingBits; /* trailingBits always goes into the lane after the last wrapped one */ \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 0, ba) \
+                    wrapOneInvert(X, input, output, 1, be) \
+                    if (laneCount < 3) { \
+                        X##bi ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 2, bi) \
+                        X##bo ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { /* 4 <= laneCount < 8 */ \
+                wrapOne(X, input, output, 0, ba) \
+                wrapOneInvert(X, input, output, 1, be) \
+                wrapOneInvert(X, input, output, 2, bi) \
+                wrapOne(X, input, output, 3, bo) \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 4, bu) \
+                        X##ga ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 4, bu) \
+                    wrapOne(X, input, output, 5, ga) \
+                    if (laneCount < 7) { \
+                        X##ge ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 6, ge) \
+                        X##gi ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always wrapped */ \
+            wrapOne(X, input, output, 0, ba) \
+            wrapOneInvert(X, input, output, 1, be) \
+            wrapOneInvert(X, input, output, 2, bi) \
+            wrapOne(X, input, output, 3, bo) \
+            wrapOne(X, input, output, 4, bu) \
+            wrapOne(X, input, output, 5, ga) \
+            wrapOne(X, input, output, 6, ge) \
+            wrapOne(X, input, output, 7, gi) \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 8, go) \
+                        X##gu ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 8, go) \
+                    wrapOne(X, input, output, 9, gu) \
+                    if (laneCount < 11) { \
+                        X##ka ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 10, ka) \
+                        X##ke ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { /* 12 <= laneCount < 16 */ \
+                wrapOneInvert(X, input, output, 8, go) \
+                wrapOne(X, input, output, 9, gu) \
+                wrapOne(X, input, output, 10, ka) \
+                wrapOne(X, input, output, 11, ke) \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 12, ki) \
+                        X##ko ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 12, ki) \
+                    wrapOne(X, input, output, 13, ko) \
+                    if (laneCount < 15) { \
+                        X##ku ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 14, ku) \
+                        X##ma ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are wrapped unconditionally */ \
+        wrapOne(X, input, output, 0, ba) \
+        wrapOneInvert(X, input, output, 1, be) \
+        wrapOneInvert(X, input, output, 2, bi) \
+        wrapOne(X, input, output, 3, bo) \
+        wrapOne(X, input, output, 4, bu) \
+        wrapOne(X, input, output, 5, ga) \
+        wrapOne(X, input, output, 6, ge) \
+        wrapOne(X, input, output, 7, gi) \
+        wrapOneInvert(X, input, output, 8, go) \
+        wrapOne(X, input, output, 9, gu) \
+        wrapOne(X, input, output, 10, ka) \
+        wrapOne(X, input, output, 11, ke) \
+        wrapOneInvert(X, input, output, 12, ki) \
+        wrapOne(X, input, output, 13, ko) \
+        wrapOne(X, input, output, 14, ku) \
+        wrapOne(X, input, output, 15, ma) \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 16, me) \
+                        X##mi ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 16, me) \
+                    wrapOneInvert(X, input, output, 17, mi) \
+                    if (laneCount < 19) { \
+                        X##mo ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 18, mo) \
+                        X##mu ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { /* 20 <= laneCount < 24 */ \
+                wrapOne(X, input, output, 16, me) \
+                wrapOneInvert(X, input, output, 17, mi) \
+                wrapOne(X, input, output, 18, mo) \
+                wrapOne(X, input, output, 19, mu) \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 20, sa) \
+                        X##se ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 20, sa) \
+                    wrapOne(X, input, output, 21, se) \
+                    if (laneCount < 23) { \
+                        X##si ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 22, si) \
+                        X##so ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* laneCount >= 24 */ \
+            wrapOne(X, input, output, 16, me) \
+            wrapOneInvert(X, input, output, 17, mi) \
+            wrapOne(X, input, output, 18, mo) \
+            wrapOne(X, input, output, 19, mu) \
+            wrapOneInvert(X, input, output, 20, sa) \
+            wrapOne(X, input, output, 21, se) \
+            wrapOne(X, input, output, 22, si) \
+            wrapOne(X, input, output, 23, so) \
+            if (laneCount < 25) { \
+                X##su ^= trailingBits; \
+            } \
+            else { /* all 25 lanes wrapped: trailingBits is not applied at all */ \
+                wrapOne(X, input, output, 24, su) \
+            } \
+        } \
+    }
+
+#define unwrap(X, input, output, laneCount, trailingBits) /* lanes 0..laneCount-1: unwrapOne / unwrapOneInvert; state lane number laneCount (when < 25): ^= trailingBits */ \
+    if (laneCount < 16) { /* binary decision tree on laneCount; every leaf runs straight-line code */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 0, ba) \
+                        X##be ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 0, ba) \
+                    unwrapOneInvert(X, input, output, 1, be) /* NOTE(review): the Invert variant is used for lanes 1, 2, 8, 12, 17 and 20 only; presumably those lanes are kept complemented - confirm against unwrapOneInvert */ \
+                    if (laneCount < 3) { \
+                        X##bi ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 2, bi) \
+                        X##bo ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOne(X, input, output, 0, ba) \
+                unwrapOneInvert(X, input, output, 1, be) \
+                unwrapOneInvert(X, input, output, 2, bi) \
+                unwrapOne(X, input, output, 3, bo) \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 4, bu) \
+                        X##ga ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 4, bu) \
+                    unwrapOne(X, input, output, 5, ga) \
+                    if (laneCount < 7) { \
+                        X##ge ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 6, ge) \
+                        X##gi ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            unwrapOne(X, input, output, 0, ba) \
+            unwrapOneInvert(X, input, output, 1, be) \
+            unwrapOneInvert(X, input, output, 2, bi) \
+            unwrapOne(X, input, output, 3, bo) \
+            unwrapOne(X, input, output, 4, bu) \
+            unwrapOne(X, input, output, 5, ga) \
+            unwrapOne(X, input, output, 6, ge) \
+            unwrapOne(X, input, output, 7, gi) \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 8, go) \
+                        X##gu ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 8, go) \
+                    unwrapOne(X, input, output, 9, gu) \
+                    if (laneCount < 11) { \
+                        X##ka ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 10, ka) \
+                        X##ke ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOneInvert(X, input, output, 8, go) \
+                unwrapOne(X, input, output, 9, gu) \
+                unwrapOne(X, input, output, 10, ka) \
+                unwrapOne(X, input, output, 11, ke) \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 12, ki) \
+                        X##ko ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 12, ki) \
+                    unwrapOne(X, input, output, 13, ko) \
+                    if (laneCount < 15) { \
+                        X##ku ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 14, ku) \
+                        X##ma ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { \
+        unwrapOne(X, input, output, 0, ba) \
+        unwrapOneInvert(X, input, output, 1, be) \
+        unwrapOneInvert(X, input, output, 2, bi) \
+        unwrapOne(X, input, output, 3, bo) \
+        unwrapOne(X, input, output, 4, bu) \
+        unwrapOne(X, input, output, 5, ga) \
+        unwrapOne(X, input, output, 6, ge) \
+        unwrapOne(X, input, output, 7, gi) \
+        unwrapOneInvert(X, input, output, 8, go) \
+        unwrapOne(X, input, output, 9, gu) \
+        unwrapOne(X, input, output, 10, ka) \
+        unwrapOne(X, input, output, 11, ke) \
+        unwrapOneInvert(X, input, output, 12, ki) \
+        unwrapOne(X, input, output, 13, ko) \
+        unwrapOne(X, input, output, 14, ku) \
+        unwrapOne(X, input, output, 15, ma) \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 16, me) \
+                        X##mi ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 16, me) \
+                    unwrapOneInvert(X, input, output, 17, mi) \
+                    if (laneCount < 19) { \
+                        X##mo ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 18, mo) \
+                        X##mu ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOne(X, input, output, 16, me) \
+                unwrapOneInvert(X, input, output, 17, mi) \
+                unwrapOne(X, input, output, 18, mo) \
+                unwrapOne(X, input, output, 19, mu) \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 20, sa) \
+                        X##se ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 20, sa) \
+                    unwrapOne(X, input, output, 21, se) \
+                    if (laneCount < 23) { \
+                        X##si ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 22, si) \
+                        X##so ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            unwrapOne(X, input, output, 16, me) \
+            unwrapOneInvert(X, input, output, 17, mi) \
+            unwrapOne(X, input, output, 18, mo) \
+            unwrapOne(X, input, output, 19, mu) \
+            unwrapOneInvert(X, input, output, 20, sa) \
+            unwrapOne(X, input, output, 21, se) \
+            unwrapOne(X, input, output, 22, si) \
+            unwrapOne(X, input, output, 23, so) \
+            if (laneCount < 25) { \
+                X##su ^= trailingBits; \
+            } \
+            else { \
+                unwrapOne(X, input, output, 24, su) /* laneCount >= 25: every lane is processed and trailingBits is not applied */ \
+            } \
+        } \
+    }
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
new file mode 100644
index 00000000000..6cf765e6ce1
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt32.h
@@ -0,0 +1,37 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_  /* same guard name as KeccakP-1600-SnP-opt64.h: only the first of the two variants included takes effect */
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ */
+
+#define KeccakP1600_implementation      "in-place 32-bit optimized implementation"
+#define KeccakP1600_stateSizeInBytes    200  /* 25 lanes of 8 bytes each */
+#define KeccakP1600_stateAlignment      8
+
+#define KeccakP1600_StaticInitialize()  /* expands to nothing */
+void KeccakP1600_Initialize(void *state);  /* sets the whole state to zero */
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);  /* XORs one byte into the state at byte position offset */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);  /* clears the first byteCount bytes of the state */
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);  /* output = input XOR state bytes */
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
new file mode 100644
index 00000000000..889a31a7944
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP-opt64.h
@@ -0,0 +1,49 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_  /* same guard name as KeccakP-1600-SnP-opt32.h: only the first of the two variants included takes effect */
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ */
+
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-opt64-config.h"
+
+#define KeccakP1600_implementation      "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")"
+#define KeccakP1600_stateSizeInBytes    200  /* 25 lanes of 8 bytes each */
+#define KeccakP1600_stateAlignment      8
+#define KeccakF1600_FastLoop_supported  /* advertises KeccakF1600_FastLoop_Absorb, declared at the end of this header */
+
+#include <stddef.h>
+
+#define KeccakP1600_StaticInitialize()  /* expands to nothing */
+void KeccakP1600_Initialize(void *state);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)  /* NOTE(review): brg_endian.h is not included above, so both names must come from the config header or the includer; if neither is defined the test reads 0 == 0 and this branch is taken - confirm */
+#define KeccakP1600_AddByte(state, byte, offset) /* XORs byte directly into the state memory at byte index offset */ \
+    ((unsigned char*)(state))[(offset)] ^= (byte)
+#else
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
+#endif
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-SnP.h b/Modules/_sha3/kcp/KeccakP-1600-SnP.h
new file mode 100644
index 00000000000..0b23f09a6a4
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-SnP.h
@@ -0,0 +1,7 @@
+#if KeccakOpt == 64  /* KeccakOpt selects which optimized implementation header is pulled in */
+  #include "KeccakP-1600-SnP-opt64.h"
+#elif KeccakOpt == 32
+  #include "KeccakP-1600-SnP-opt32.h"
+#else  /* any other value lands here; an undefined KeccakOpt evaluates as 0 in #if and so also stops the build */
+  #error "No KeccakOpt"
+#endif
diff --git a/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
new file mode 100644
index 00000000000..886dc441971
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-inplace32BI.c
@@ -0,0 +1,1160 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include    <string.h>
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-SnP.h"
+#include "SnP-Relaned.h"
+
+typedef unsigned char UINT8;
+typedef unsigned int UINT32;
+/* WARNING: on 8-bit and 16-bit platforms, this should be replaced by: */
+
+/*typedef unsigned long       UINT32; */
+
+
+#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))  /* 32-bit rotate left; only defined for 0 < offset < 32; a is parenthesised so the cast covers a whole expression argument */
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+/* Unshuffles low into temp0 and high into temp1: even-indexed bits end up in bits 0-15, odd-indexed bits in bits 16-31. */
+#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        temp0 = (low); \
+        temp = (temp0 ^ (temp0 >>  1)) & 0x22222222UL;  temp0 = temp0 ^ temp ^ (temp <<  1); /* swaps bits 1 and 2 of every nibble */ \
+        temp = (temp0 ^ (temp0 >>  2)) & 0x0C0C0C0CUL;  temp0 = temp0 ^ temp ^ (temp <<  2); /* swaps the two middle bit pairs of every byte */ \
+        temp = (temp0 ^ (temp0 >>  4)) & 0x00F000F0UL;  temp0 = temp0 ^ temp ^ (temp <<  4); /* swaps the two middle nibbles of every 16-bit half */ \
+        temp = (temp0 ^ (temp0 >>  8)) & 0x0000FF00UL;  temp0 = temp0 ^ temp ^ (temp <<  8); /* swaps the two middle bytes of the word */ \
+        temp1 = (high); /* same four steps for the high word */ \
+        temp = (temp1 ^ (temp1 >>  1)) & 0x22222222UL;  temp1 = temp1 ^ temp ^ (temp <<  1); \
+        temp = (temp1 ^ (temp1 >>  2)) & 0x0C0C0C0CUL;  temp1 = temp1 ^ temp ^ (temp <<  2); \
+        temp = (temp1 ^ (temp1 >>  4)) & 0x00F000F0UL;  temp1 = temp1 ^ temp ^ (temp <<  4); \
+        temp = (temp1 ^ (temp1 >>  8)) & 0x0000FF00UL;  temp1 = temp1 ^ temp ^ (temp <<  8);
+
+#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) /* (even, odd) ^= interleaved form of (low, high); low, high, even and odd are each expanded exactly once */ \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); /* even bits of low in bits 0-15, even bits of high in bits 16-31 */ \
+        odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) /* (even, odd) &= interleaved form of (low, high), i.e. (low, high) acts as a bit mask; each argument is expanded exactly once */ \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even &= (temp0 & 0x0000FFFF) | (temp1 << 16); /* even bits of low in bits 0-15, even bits of high in bits 16-31 */ \
+        odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) /* (even, odd) = interleaved form of (low, high), replacing the previous contents; each argument is expanded exactly once */ \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even = (temp0 & 0x0000FFFF) | (temp1 << 16); /* even bits of low in bits 0-15, even bits of high in bits 16-31 */ \
+        odd = (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
+/* Reverses prepareToBitInterleaving: regroups (even, odd) per word, then applies the swap steps in the opposite order. */
+#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        temp0 = (even); \
+        temp1 = (odd); \
+        temp = (temp0 & 0x0000FFFF) | (temp1 << 16); /* both 16-bit halves that belong to the low word */ \
+        temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); /* both 16-bit halves that belong to the high word */ \
+        temp0 = temp; \
+        temp = (temp0 ^ (temp0 >>  8)) & 0x0000FF00UL;  temp0 = temp0 ^ temp ^ (temp <<  8); /* widest swap first, narrowest last */ \
+        temp = (temp0 ^ (temp0 >>  4)) & 0x00F000F0UL;  temp0 = temp0 ^ temp ^ (temp <<  4); \
+        temp = (temp0 ^ (temp0 >>  2)) & 0x0C0C0C0CUL;  temp0 = temp0 ^ temp ^ (temp <<  2); \
+        temp = (temp0 ^ (temp0 >>  1)) & 0x22222222UL;  temp0 = temp0 ^ temp ^ (temp <<  1); \
+        temp = (temp1 ^ (temp1 >>  8)) & 0x0000FF00UL;  temp1 = temp1 ^ temp ^ (temp <<  8); \
+        temp = (temp1 ^ (temp1 >>  4)) & 0x00F000F0UL;  temp1 = temp1 ^ temp ^ (temp <<  4); \
+        temp = (temp1 ^ (temp1 >>  2)) & 0x0C0C0C0CUL;  temp1 = temp1 ^ temp ^ (temp <<  2); \
+        temp = (temp1 ^ (temp1 >>  1)) & 0x22222222UL;  temp1 = temp1 ^ temp ^ (temp <<  1);
+
+#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) /* (low, high) = plain 32-bit words rebuilt from (even, odd); each argument is expanded exactly once */ \
+        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        low = temp0; \
+        high = temp1;
+
+#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) /* (lowOut, highOut) = (lowIn, highIn) XOR the plain words rebuilt from (even, odd); each argument is expanded exactly once */ \
+        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        lowOut = lowIn ^ temp0; \
+        highOut = highIn ^ temp1;
+
+void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    /* Clears bytes [offset, offset+length) of one lane by AND-ing it with a byte mask (0x00 there, 0xFF elsewhere); requires offset+length <= 8. */
+    memset(laneAsBytes, 0xFF, offset);
+    memset(laneAsBytes+offset, 0x00, length);
+    memset(laneAsBytes+offset+length, 0xFF, 8-offset-length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(&low, laneAsBytes+0, 4);   /* memcpy rather than a UINT32* cast: a byte array has no 4-byte alignment guarantee */
+    memcpy(&high, laneAsBytes+4, 4);
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Initialize(void *state)
+{   /* Resets the state to all-zero bytes. */
+    memset(state, 0, 200);  /* 200 matches KeccakP1600_stateSizeInBytes in the opt32 header */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    /* XORs `byte` into the state at byte position `offset`, going through
+     * the bit-interleaved encoding of the lane that contains it. */
+    UINT32 *halfLanes = (UINT32*)state + (offset/8)*2;   /* even and odd word of that lane */
+    const unsigned int byteInLane = offset%8;
+    const UINT32 shifted = (UINT32)byte << ((byteInLane%4)*8);
+    UINT32 low = 0, high = 0;
+    UINT32 t, t0, t1;
+
+    /* Bytes 0-3 belong to the lane's low word, bytes 4-7 to its high word. */
+    if (byteInLane < 4)
+        low = shifted;
+    else
+        high = shifted;
+
+    toBitInterleavingAndXOR(low, high, halfLanes[0], halfLanes[1], t, t0, t1);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    /* XORs data[0..length) into bytes [offset, offset+length) of one lane; the other bytes XOR with zero. Requires offset+length <= 8. */
+    memset(laneAsBytes, 0, 8);
+    memcpy(laneAsBytes+offset, data, length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(&low, laneAsBytes+0, 4);   /* memcpy rather than a UINT32* cast: a byte array has no 4-byte alignment guarantee */
+    memcpy(&high, laneAsBytes+4, 4);
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+    /* XORs laneCount whole 8-byte lanes from data into the start of the state, */
+    /* converting each lane to its bit-interleaved (even, odd) word pair first. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)data;
+    UINT32 * pS = (UINT32*)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low, high;
+        memcpy(&low, pI++, 4);
+        memcpy(&high, pI++, 4);
+        toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1);
+#else
+        toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
+#endif
+    }
+#else
+    unsigned int lanePosition;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        const unsigned char *laneAsBytes = data+lanePosition*8;
+        UINT32 temp, temp0, temp1;
+        UINT32 low = laneAsBytes[0]
+            | ((UINT32)(laneAsBytes[1]) << 8)
+            | ((UINT32)(laneAsBytes[2]) << 16)
+            | ((UINT32)(laneAsBytes[3]) << 24);
+        UINT32 high = laneAsBytes[4]
+            | ((UINT32)(laneAsBytes[5]) << 8)
+            | ((UINT32)(laneAsBytes[6]) << 16)
+            | ((UINT32)(laneAsBytes[7]) << 24);
+        toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{   /* Hands the byte range to SnP_AddBytes (not defined in this file; presumably in SnP-Relaned.h) together with the whole-lane and in-lane XOR helpers; the final 8 is presumably the lane size in bytes. */
+    SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{   /* Replaces bytes [offset, offset+length) of one lane with data: clear first, then XOR in the new bytes. */
+    KeccakP1600_SetBytesInLaneToZero(state, lanePosition, offset, length);  /* the target bytes become 0 ... */
+    KeccakP1600_AddBytesInLane(state, lanePosition, data, offset, length);  /* ... so that XOR-ing data in amounts to storing it */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+    /* Stores laneCount whole 8-byte lanes from data at the start of the state, */
+    /* converting each lane to its bit-interleaved (even, odd) word pair first. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)data;
+    UINT32 * pS = (UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low, high;
+        memcpy(&low, pI++, 4);
+        memcpy(&high, pI++, 4);
+        toBitInterleavingAndSet(low, high, *(pS++), *(pS++), t, x0, x1);
+#else
+        toBitInterleavingAndSet(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
+#endif
+    }
+#else
+    unsigned int lanePosition;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        const unsigned char *laneAsBytes = data+lanePosition*8;
+        UINT32 temp, temp0, temp1;
+        UINT32 low = laneAsBytes[0]
+            | ((UINT32)(laneAsBytes[1]) << 8)
+            | ((UINT32)(laneAsBytes[2]) << 16)
+            | ((UINT32)(laneAsBytes[3]) << 24);
+        UINT32 high = laneAsBytes[4]
+            | ((UINT32)(laneAsBytes[5]) << 8)
+            | ((UINT32)(laneAsBytes[6]) << 16)
+            | ((UINT32)(laneAsBytes[7]) << 24);
+        toBitInterleavingAndSet(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{   /* Hands the byte range to SnP_OverwriteBytes (not defined in this file; presumably in SnP-Relaned.h) together with the whole-lane and in-lane overwrite helpers; the final 8 is presumably the lane size in bytes. */
+    SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{
+    /* Clears the first byteCount bytes of the state. */
+    const unsigned int fullLanes = byteCount/8;   /* lanes that are cleared entirely */
+    const unsigned int tailBytes = byteCount%8;   /* leading bytes of the following lane */
+
+    /* An all-zero lane is also all-zero in the interleaved encoding. */
+    memset(state, 0, fullLanes*8);
+    /* A partially cleared lane has to be masked instead. */
+    if (tailBytes != 0)
+        KeccakP1600_SetBytesInLaneToZero(state, fullLanes, 0, tailBytes);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    const UINT32 *stateAsHalfLanes = (const UINT32*)state;   /* read-only: keep the const of the parameter */
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+    /* Copies bytes [offset, offset+length) of one lane to data after undoing the bit interleaving; requires offset+length <= 8. */
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(laneAsBytes+0, &low, 4);   /* memcpy rather than a UINT32* cast: a byte array has no 4-byte alignment guarantee */
+    memcpy(laneAsBytes+4, &high, 4);
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    memcpy(data, laneAsBytes+offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{   /* Writes the first laneCount lanes of the state to data, 8 bytes per lane, undoing the bit interleaving. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    UINT32 * pI = (UINT32 *)data;
+    const UINT32 * pS = ( const UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i) {  /* i only counts iterations; the pointers advance through the ++ in the macro arguments */
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low;
+        UINT32 high;
+        fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
+        memcpy(pI++, &low, 4);  /* data may be unaligned, so store through memcpy */
+        memcpy(pI++, &high, 4);
+#else
+        fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1)  /* the macro expands each argument once, so each ++ runs once */
+#endif
+    }
+#else
+    unsigned int lanePosition;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        UINT32 *stateAsHalfLanes = (UINT32*)state;
+        UINT32 low, high, temp, temp0, temp1;
+        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        UINT8 laneAsBytes[8];
+        laneAsBytes[0] = low & 0xFF;  /* least significant byte first, whatever the host byte order */
+        laneAsBytes[1] = (low >> 8) & 0xFF;
+        laneAsBytes[2] = (low >> 16) & 0xFF;
+        laneAsBytes[3] = (low >> 24) & 0xFF;
+        laneAsBytes[4] = high & 0xFF;
+        laneAsBytes[5] = (high >> 8) & 0xFF;
+        laneAsBytes[6] = (high >> 16) & 0xFF;
+        laneAsBytes[7] = (high >> 24) & 0xFF;
+        memcpy(data+lanePosition*8, laneAsBytes, 8);
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{   /* Hands the byte range to SnP_ExtractBytes (not defined in this file; presumably in SnP-Relaned.h) together with the whole-lane and in-lane extract helpers; the final 8 is presumably the lane size in bytes. */
+    SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    const UINT32 *stateAsHalfLanes = (const UINT32*)state;   /* read-only: keep the const of the parameter */
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+    unsigned int i;
+    /* output[i] = input[i] XOR byte (offset+i) of one lane, for i in [0, length); requires offset+length <= 8. */
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(laneAsBytes+0, &low, 4);   /* memcpy rather than a UINT32* cast: a byte array has no 4-byte alignment guarantee */
+    memcpy(laneAsBytes+4, &high, 4);
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    for(i=0; i<length; i++)
+        output[i] = input[i] ^ laneAsBytes[offset+i];
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{
+    /* output = input XOR the first laneCount lanes of the state (8 bytes per lane), undoing the bit interleaving. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)input;
+    UINT32 * pO = (UINT32 *)output;
+    const UINT32 * pS = (const UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i) {
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low, high;
+        fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
+        memcpy(&t, pI++, 4); t ^= low; memcpy(pO++, &t, 4);   /* input and output may be unaligned: go through memcpy */
+        memcpy(&t, pI++, 4); t ^= high; memcpy(pO++, &t, 4);
+#else
+        fromBitInterleavingAndXOR(*(pS++), *(pS++), *(pI++), *(pI++), *(pO++), *(pO++), t, x0, x1)
+#endif
+    }
+#else
+    unsigned int lanePosition, i;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        UINT32 *stateAsHalfLanes = (UINT32*)state;
+        UINT32 low, high, temp, temp0, temp1;
+        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        UINT8 laneAsBytes[8];
+        laneAsBytes[0] = low & 0xFF;
+        laneAsBytes[1] = (low >> 8) & 0xFF;
+        laneAsBytes[2] = (low >> 16) & 0xFF;
+        laneAsBytes[3] = (low >> 24) & 0xFF;
+        laneAsBytes[4] = high & 0xFF;
+        laneAsBytes[5] = (high >> 8) & 0xFF;
+        laneAsBytes[6] = (high >> 16) & 0xFF;
+        laneAsBytes[7] = (high >> 24) & 0xFF;
+        for(i=0; i<8; i++)   /* byte-wise XOR: bytes 4-7 now pair with input bytes 4-7, and no alignment is assumed */
+            output[lanePosition*8+i] = input[lanePosition*8+i] ^ laneAsBytes[i];
+    }
+#endif
+}
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{   /* Hands the byte range to SnP_ExtractAndAddBytes (not defined in this file; presumably in SnP-Relaned.h) together with the whole-lane and in-lane helpers; the final 8 is presumably the lane size in bytes. */
+    SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =  /* 24 rows of two words (even-indexed bits, odd-indexed bits of a 64-bit constant) plus one trailing word */
+{
+    0x00000001UL,    0x00000000UL,  /* 0x0000000000000001: only bit 0 (even) is set */
+    0x00000000UL,    0x00000089UL,  /* 0x0000000000008082: bits 1, 7, 15 are all odd and map to bits 0, 3, 7 */
+    0x00000000UL,    0x8000008bUL,
+    0x00000000UL,    0x80008080UL,
+    0x00000001UL,    0x0000008bUL,
+    0x00000001UL,    0x00008000UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000001UL,    0x80000082UL,
+    0x00000000UL,    0x0000000bUL,
+    0x00000000UL,    0x0000000aUL,
+    0x00000001UL,    0x00008082UL,
+    0x00000000UL,    0x00008003UL,
+    0x00000001UL,    0x0000808bUL,
+    0x00000001UL,    0x8000000bUL,
+    0x00000001UL,    0x8000008aUL,
+    0x00000001UL,    0x80000081UL,
+    0x00000000UL,    0x80000081UL,
+    0x00000000UL,    0x80000008UL,
+    0x00000000UL,    0x00000083UL,
+    0x00000000UL,    0x80008003UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000000UL,    0x80000088UL,
+    0x00000001UL,    0x00008000UL,
+    0x00000000UL,    0x80008082UL,
+    0x000000FFUL  /* NOTE(review): 49th word after the 24 pairs, presumably an end-of-table marker for the round loop - confirm at the use site */
+};
+
+#define KeccakAtoD_round0() /* XORs each column's five half-lanes into parities (Cx..Cw) and combines them into Da0..Du1; ROL32 by 1 on one half looks like the theta step's 64-bit rotate in interleaved form */ \
+        Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
+        Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
+        Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+        Da1 = Cz^Du0; \
+\
+        Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
+        Do1 = Cy^Cx; \
+\
+        Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
+        De1 = Cz^Cw; \
+\
+        Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
+        Di1 = Du1^Cw; \
+\
+        Du0 = Cw^ROL32(Cz, 1); /* Du0/Du1 served as e-column parities above and only now receive their final D values */ \
+        Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round1() /* same parity/D computation as KeccakAtoD_round0, but reading a different arrangement of half-lanes -- presumably where the previous in-place round left them */ \
+        Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \
+        Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \
+        Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \
+        Da1 = Cz^Du0; \
+\
+        Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \
+        Do1 = Cy^Cx; \
+\
+        Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \
+        De1 = Cz^Cw; \
+\
+        Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \
+        Di1 = Du1^Cw; \
+\
+        Du0 = Cw^ROL32(Cz, 1); /* Du0/Du1 served as parities above and only now receive their final D values */ \
+        Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round2() /* same parity/D computation as KeccakAtoD_round0, but reading a different arrangement of half-lanes -- presumably where the two previous in-place rounds left them */ \
+        Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \
+        Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \
+        Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \
+        Da1 = Cz^Du0; \
+\
+        Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \
+        Do1 = Cy^Cx; \
+\
+        Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \
+        De1 = Cz^Cw; \
+\
+        Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \
+        Di1 = Du1^Cw; \
+\
+        Du0 = Cw^ROL32(Cz, 1); /* Du0/Du1 served as parities above and only now receive their final D values */ \
+        Du1 = Cy^Cx; \
+
+#define KeccakAtoD_round3() /* same parity/D computation as KeccakAtoD_round0, but reading a different arrangement of half-lanes -- presumably where the three previous in-place rounds left them */ \
+        Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \
+        Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \
+        Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \
+        Da1 = Cz^Du0; \
+\
+        Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \
+        Do1 = Cy^Cx; \
+\
+        Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \
+        De1 = Cz^Cw; \
+\
+        Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \
+        Di1 = Du1^Cw; \
+\
+        Du0 = Cw^ROL32(Cz, 1); /* Du0/Du1 served as parities above and only now receive their final D values */ \
+        Du1 = Cy^Cx; \
+
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) /* permutes the state in place using the constants of the last nRounds of the 24 rounds; nRounds must be 4, 8, ..., 24 (see the loop condition at the bottom) */
+{
+    {
+        UINT32 Da0, De0, Di0, Do0, Du0;
+        UINT32 Da1, De1, Di1, Do1, Du1;
+        UINT32 Ca0, Ce0, Ci0, Co0, Cu0;
+        UINT32 Cx, Cy, Cz, Cw;
+        #define Ba Ca0
+        #define Be Ce0
+        #define Bi Ci0
+        #define Bo Co0
+        #define Bu Cu0 /* NOTE(review): Ba..Bu alias the C temporaries and are not #undef'd at the end of this function */
+        const UINT32 *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; /* skip the two-word constants of the first 24-nRounds rounds */
+        UINT32 *stateAsHalfLanes = (UINT32*)state; /* the state is accessed as 50 32-bit words, two per lane */
+        #define Aba0 stateAsHalfLanes[ 0]
+        #define Aba1 stateAsHalfLanes[ 1]
+        #define Abe0 stateAsHalfLanes[ 2]
+        #define Abe1 stateAsHalfLanes[ 3]
+        #define Abi0 stateAsHalfLanes[ 4]
+        #define Abi1 stateAsHalfLanes[ 5]
+        #define Abo0 stateAsHalfLanes[ 6]
+        #define Abo1 stateAsHalfLanes[ 7]
+        #define Abu0 stateAsHalfLanes[ 8]
+        #define Abu1 stateAsHalfLanes[ 9]
+        #define Aga0 stateAsHalfLanes[10]
+        #define Aga1 stateAsHalfLanes[11]
+        #define Age0 stateAsHalfLanes[12]
+        #define Age1 stateAsHalfLanes[13]
+        #define Agi0 stateAsHalfLanes[14]
+        #define Agi1 stateAsHalfLanes[15]
+        #define Ago0 stateAsHalfLanes[16]
+        #define Ago1 stateAsHalfLanes[17]
+        #define Agu0 stateAsHalfLanes[18]
+        #define Agu1 stateAsHalfLanes[19]
+        #define Aka0 stateAsHalfLanes[20]
+        #define Aka1 stateAsHalfLanes[21]
+        #define Ake0 stateAsHalfLanes[22]
+        #define Ake1 stateAsHalfLanes[23]
+        #define Aki0 stateAsHalfLanes[24]
+        #define Aki1 stateAsHalfLanes[25]
+        #define Ako0 stateAsHalfLanes[26]
+        #define Ako1 stateAsHalfLanes[27]
+        #define Aku0 stateAsHalfLanes[28]
+        #define Aku1 stateAsHalfLanes[29]
+        #define Ama0 stateAsHalfLanes[30]
+        #define Ama1 stateAsHalfLanes[31]
+        #define Ame0 stateAsHalfLanes[32]
+        #define Ame1 stateAsHalfLanes[33]
+        #define Ami0 stateAsHalfLanes[34]
+        #define Ami1 stateAsHalfLanes[35]
+        #define Amo0 stateAsHalfLanes[36]
+        #define Amo1 stateAsHalfLanes[37]
+        #define Amu0 stateAsHalfLanes[38]
+        #define Amu1 stateAsHalfLanes[39]
+        #define Asa0 stateAsHalfLanes[40]
+        #define Asa1 stateAsHalfLanes[41]
+        #define Ase0 stateAsHalfLanes[42]
+        #define Ase1 stateAsHalfLanes[43]
+        #define Asi0 stateAsHalfLanes[44]
+        #define Asi1 stateAsHalfLanes[45]
+        #define Aso0 stateAsHalfLanes[46]
+        #define Aso1 stateAsHalfLanes[47]
+        #define Asu0 stateAsHalfLanes[48]
+        #define Asu1 stateAsHalfLanes[49]
+
+        do
+        {
+            /* --- Code for 4 rounds */
+
+            /* --- using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+
+            KeccakAtoD_round0(); /* 1st of the 4 unrolled rounds */
+
+            Ba = (Aba0^Da0);
+            Be = ROL32((Age0^De0), 22);
+            Bi = ROL32((Aki1^Di1), 22);
+            Bo = ROL32((Amo1^Do1), 11);
+            Bu = ROL32((Asu0^Du0), 7);
+            Aba0 =   Ba ^((~Be)&  Bi );
+            Aba0 ^= *(pRoundConstants++);
+            Age0 =   Be ^((~Bi)&  Bo );
+            Aki1 =   Bi ^((~Bo)&  Bu );
+            Amo1 =   Bo ^((~Bu)&  Ba );
+            Asu0 =   Bu ^((~Ba)&  Be );
+
+            Ba = (Aba1^Da1);
+            Be = ROL32((Age1^De1), 22);
+            Bi = ROL32((Aki0^Di0), 21);
+            Bo = ROL32((Amo0^Do0), 10);
+            Bu = ROL32((Asu1^Du1), 7);
+            Aba1 =   Ba ^((~Be)&  Bi );
+            Aba1 ^= *(pRoundConstants++);
+            Age1 =   Be ^((~Bi)&  Bo );
+            Aki0 =   Bi ^((~Bo)&  Bu );
+            Amo0 =   Bo ^((~Bu)&  Ba );
+            Asu1 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Aka1^Da1), 2);
+            Bo = ROL32((Ame1^De1), 23);
+            Bu = ROL32((Asi1^Di1), 31);
+            Ba = ROL32((Abo0^Do0), 14);
+            Be = ROL32((Agu0^Du0), 10);
+            Aka1 =   Ba ^((~Be)&  Bi );
+            Ame1 =   Be ^((~Bi)&  Bo );
+            Asi1 =   Bi ^((~Bo)&  Bu );
+            Abo0 =   Bo ^((~Bu)&  Ba );
+            Agu0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Aka0^Da0), 1);
+            Bo = ROL32((Ame0^De0), 22);
+            Bu = ROL32((Asi0^Di0), 30);
+            Ba = ROL32((Abo1^Do1), 14);
+            Be = ROL32((Agu1^Du1), 10);
+            Aka0 =   Ba ^((~Be)&  Bi );
+            Ame0 =   Be ^((~Bi)&  Bo );
+            Asi0 =   Bi ^((~Bo)&  Bu );
+            Abo1 =   Bo ^((~Bu)&  Ba );
+            Agu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Asa0^Da0), 9);
+            Ba = ROL32((Abe1^De1), 1);
+            Be = ROL32((Agi0^Di0), 3);
+            Bi = ROL32((Ako1^Do1), 13);
+            Bo = ROL32((Amu0^Du0), 4);
+            Asa0 =   Ba ^((~Be)&  Bi );
+            Abe1 =   Be ^((~Bi)&  Bo );
+            Agi0 =   Bi ^((~Bo)&  Bu );
+            Ako1 =   Bo ^((~Bu)&  Ba );
+            Amu0 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Asa1^Da1), 9);
+            Ba = (Abe0^De0);
+            Be = ROL32((Agi1^Di1), 3);
+            Bi = ROL32((Ako0^Do0), 12);
+            Bo = ROL32((Amu1^Du1), 4);
+            Asa1 =   Ba ^((~Be)&  Bi );
+            Abe0 =   Be ^((~Bi)&  Bo );
+            Agi1 =   Bi ^((~Bo)&  Bu );
+            Ako0 =   Bo ^((~Bu)&  Ba );
+            Amu1 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Aga0^Da0), 18);
+            Bi = ROL32((Ake0^De0), 5);
+            Bo = ROL32((Ami1^Di1), 8);
+            Bu = ROL32((Aso0^Do0), 28);
+            Ba = ROL32((Abu1^Du1), 14);
+            Aga0 =   Ba ^((~Be)&  Bi );
+            Ake0 =   Be ^((~Bi)&  Bo );
+            Ami1 =   Bi ^((~Bo)&  Bu );
+            Aso0 =   Bo ^((~Bu)&  Ba );
+            Abu1 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Aga1^Da1), 18);
+            Bi = ROL32((Ake1^De1), 5);
+            Bo = ROL32((Ami0^Di0), 7);
+            Bu = ROL32((Aso1^Do1), 28);
+            Ba = ROL32((Abu0^Du0), 13);
+            Aga1 =   Ba ^((~Be)&  Bi );
+            Ake1 =   Be ^((~Bi)&  Bo );
+            Ami0 =   Bi ^((~Bo)&  Bu );
+            Aso1 =   Bo ^((~Bu)&  Ba );
+            Abu0 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Ama1^Da1), 21);
+            Bu = ROL32((Ase0^De0), 1);
+            Ba = ROL32((Abi0^Di0), 31);
+            Be = ROL32((Ago1^Do1), 28);
+            Bi = ROL32((Aku1^Du1), 20);
+            Ama1 =   Ba ^((~Be)&  Bi );
+            Ase0 =   Be ^((~Bi)&  Bo );
+            Abi0 =   Bi ^((~Bo)&  Bu );
+            Ago1 =   Bo ^((~Bu)&  Ba );
+            Aku1 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Ama0^Da0), 20);
+            Bu = ROL32((Ase1^De1), 1);
+            Ba = ROL32((Abi1^Di1), 31);
+            Be = ROL32((Ago0^Do0), 27);
+            Bi = ROL32((Aku0^Du0), 19);
+            Ama0 =   Ba ^((~Be)&  Bi );
+            Ase1 =   Be ^((~Bi)&  Bo );
+            Abi1 =   Bi ^((~Bo)&  Bu );
+            Ago0 =   Bo ^((~Bu)&  Ba );
+            Aku0 =   Bu ^((~Ba)&  Be );
+
+            KeccakAtoD_round1(); /* 2nd of the 4 unrolled rounds */
+
+            Ba = (Aba0^Da0);
+            Be = ROL32((Ame1^De0), 22);
+            Bi = ROL32((Agi1^Di1), 22);
+            Bo = ROL32((Aso1^Do1), 11);
+            Bu = ROL32((Aku1^Du0), 7);
+            Aba0 =   Ba ^((~Be)&  Bi );
+            Aba0 ^= *(pRoundConstants++);
+            Ame1 =   Be ^((~Bi)&  Bo );
+            Agi1 =   Bi ^((~Bo)&  Bu );
+            Aso1 =   Bo ^((~Bu)&  Ba );
+            Aku1 =   Bu ^((~Ba)&  Be );
+
+            Ba = (Aba1^Da1);
+            Be = ROL32((Ame0^De1), 22);
+            Bi = ROL32((Agi0^Di0), 21);
+            Bo = ROL32((Aso0^Do0), 10);
+            Bu = ROL32((Aku0^Du1), 7);
+            Aba1 =   Ba ^((~Be)&  Bi );
+            Aba1 ^= *(pRoundConstants++);
+            Ame0 =   Be ^((~Bi)&  Bo );
+            Agi0 =   Bi ^((~Bo)&  Bu );
+            Aso0 =   Bo ^((~Bu)&  Ba );
+            Aku0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Asa1^Da1), 2);
+            Bo = ROL32((Ake1^De1), 23);
+            Bu = ROL32((Abi1^Di1), 31);
+            Ba = ROL32((Amo1^Do0), 14);
+            Be = ROL32((Agu0^Du0), 10);
+            Asa1 =   Ba ^((~Be)&  Bi );
+            Ake1 =   Be ^((~Bi)&  Bo );
+            Abi1 =   Bi ^((~Bo)&  Bu );
+            Amo1 =   Bo ^((~Bu)&  Ba );
+            Agu0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Asa0^Da0), 1);
+            Bo = ROL32((Ake0^De0), 22);
+            Bu = ROL32((Abi0^Di0), 30);
+            Ba = ROL32((Amo0^Do1), 14);
+            Be = ROL32((Agu1^Du1), 10);
+            Asa0 =   Ba ^((~Be)&  Bi );
+            Ake0 =   Be ^((~Bi)&  Bo );
+            Abi0 =   Bi ^((~Bo)&  Bu );
+            Amo0 =   Bo ^((~Bu)&  Ba );
+            Agu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Ama1^Da0), 9);
+            Ba = ROL32((Age1^De1), 1);
+            Be = ROL32((Asi1^Di0), 3);
+            Bi = ROL32((Ako0^Do1), 13);
+            Bo = ROL32((Abu1^Du0), 4);
+            Ama1 =   Ba ^((~Be)&  Bi );
+            Age1 =   Be ^((~Bi)&  Bo );
+            Asi1 =   Bi ^((~Bo)&  Bu );
+            Ako0 =   Bo ^((~Bu)&  Ba );
+            Abu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Ama0^Da1), 9);
+            Ba = (Age0^De0);
+            Be = ROL32((Asi0^Di1), 3);
+            Bi = ROL32((Ako1^Do0), 12);
+            Bo = ROL32((Abu0^Du1), 4);
+            Ama0 =   Ba ^((~Be)&  Bi );
+            Age0 =   Be ^((~Bi)&  Bo );
+            Asi0 =   Bi ^((~Bo)&  Bu );
+            Ako1 =   Bo ^((~Bu)&  Ba );
+            Abu0 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Aka1^Da0), 18);
+            Bi = ROL32((Abe1^De0), 5);
+            Bo = ROL32((Ami0^Di1), 8);
+            Bu = ROL32((Ago1^Do0), 28);
+            Ba = ROL32((Asu1^Du1), 14);
+            Aka1 =   Ba ^((~Be)&  Bi );
+            Abe1 =   Be ^((~Bi)&  Bo );
+            Ami0 =   Bi ^((~Bo)&  Bu );
+            Ago1 =   Bo ^((~Bu)&  Ba );
+            Asu1 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Aka0^Da1), 18);
+            Bi = ROL32((Abe0^De1), 5);
+            Bo = ROL32((Ami1^Di0), 7);
+            Bu = ROL32((Ago0^Do1), 28);
+            Ba = ROL32((Asu0^Du0), 13);
+            Aka0 =   Ba ^((~Be)&  Bi );
+            Abe0 =   Be ^((~Bi)&  Bo );
+            Ami1 =   Bi ^((~Bo)&  Bu );
+            Ago0 =   Bo ^((~Bu)&  Ba );
+            Asu0 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Aga1^Da1), 21);
+            Bu = ROL32((Ase0^De0), 1);
+            Ba = ROL32((Aki1^Di0), 31);
+            Be = ROL32((Abo1^Do1), 28);
+            Bi = ROL32((Amu1^Du1), 20);
+            Aga1 =   Ba ^((~Be)&  Bi );
+            Ase0 =   Be ^((~Bi)&  Bo );
+            Aki1 =   Bi ^((~Bo)&  Bu );
+            Abo1 =   Bo ^((~Bu)&  Ba );
+            Amu1 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Aga0^Da0), 20);
+            Bu = ROL32((Ase1^De1), 1);
+            Ba = ROL32((Aki0^Di1), 31);
+            Be = ROL32((Abo0^Do0), 27);
+            Bi = ROL32((Amu0^Du0), 19);
+            Aga0 =   Ba ^((~Be)&  Bi );
+            Ase1 =   Be ^((~Bi)&  Bo );
+            Aki0 =   Bi ^((~Bo)&  Bu );
+            Abo0 =   Bo ^((~Bu)&  Ba );
+            Amu0 =   Bu ^((~Ba)&  Be );
+
+            KeccakAtoD_round2(); /* 3rd of the 4 unrolled rounds */
+
+            Ba = (Aba0^Da0);
+            Be = ROL32((Ake1^De0), 22);
+            Bi = ROL32((Asi0^Di1), 22);
+            Bo = ROL32((Ago0^Do1), 11);
+            Bu = ROL32((Amu1^Du0), 7);
+            Aba0 =   Ba ^((~Be)&  Bi );
+            Aba0 ^= *(pRoundConstants++);
+            Ake1 =   Be ^((~Bi)&  Bo );
+            Asi0 =   Bi ^((~Bo)&  Bu );
+            Ago0 =   Bo ^((~Bu)&  Ba );
+            Amu1 =   Bu ^((~Ba)&  Be );
+
+            Ba = (Aba1^Da1);
+            Be = ROL32((Ake0^De1), 22);
+            Bi = ROL32((Asi1^Di0), 21);
+            Bo = ROL32((Ago1^Do0), 10);
+            Bu = ROL32((Amu0^Du1), 7);
+            Aba1 =   Ba ^((~Be)&  Bi );
+            Aba1 ^= *(pRoundConstants++);
+            Ake0 =   Be ^((~Bi)&  Bo );
+            Asi1 =   Bi ^((~Bo)&  Bu );
+            Ago1 =   Bo ^((~Bu)&  Ba );
+            Amu0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Ama0^Da1), 2);
+            Bo = ROL32((Abe0^De1), 23);
+            Bu = ROL32((Aki0^Di1), 31);
+            Ba = ROL32((Aso1^Do0), 14);
+            Be = ROL32((Agu0^Du0), 10);
+            Ama0 =   Ba ^((~Be)&  Bi );
+            Abe0 =   Be ^((~Bi)&  Bo );
+            Aki0 =   Bi ^((~Bo)&  Bu );
+            Aso1 =   Bo ^((~Bu)&  Ba );
+            Agu0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Ama1^Da0), 1);
+            Bo = ROL32((Abe1^De0), 22);
+            Bu = ROL32((Aki1^Di0), 30);
+            Ba = ROL32((Aso0^Do1), 14);
+            Be = ROL32((Agu1^Du1), 10);
+            Ama1 =   Ba ^((~Be)&  Bi );
+            Abe1 =   Be ^((~Bi)&  Bo );
+            Aki1 =   Bi ^((~Bo)&  Bu );
+            Aso0 =   Bo ^((~Bu)&  Ba );
+            Agu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Aga1^Da0), 9);
+            Ba = ROL32((Ame0^De1), 1);
+            Be = ROL32((Abi1^Di0), 3);
+            Bi = ROL32((Ako1^Do1), 13);
+            Bo = ROL32((Asu1^Du0), 4);
+            Aga1 =   Ba ^((~Be)&  Bi );
+            Ame0 =   Be ^((~Bi)&  Bo );
+            Abi1 =   Bi ^((~Bo)&  Bu );
+            Ako1 =   Bo ^((~Bu)&  Ba );
+            Asu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Aga0^Da1), 9);
+            Ba = (Ame1^De0);
+            Be = ROL32((Abi0^Di1), 3);
+            Bi = ROL32((Ako0^Do0), 12);
+            Bo = ROL32((Asu0^Du1), 4);
+            Aga0 =   Ba ^((~Be)&  Bi );
+            Ame1 =   Be ^((~Bi)&  Bo );
+            Abi0 =   Bi ^((~Bo)&  Bu );
+            Ako0 =   Bo ^((~Bu)&  Ba );
+            Asu0 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Asa1^Da0), 18);
+            Bi = ROL32((Age1^De0), 5);
+            Bo = ROL32((Ami1^Di1), 8);
+            Bu = ROL32((Abo1^Do0), 28);
+            Ba = ROL32((Aku0^Du1), 14);
+            Asa1 =   Ba ^((~Be)&  Bi );
+            Age1 =   Be ^((~Bi)&  Bo );
+            Ami1 =   Bi ^((~Bo)&  Bu );
+            Abo1 =   Bo ^((~Bu)&  Ba );
+            Aku0 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Asa0^Da1), 18);
+            Bi = ROL32((Age0^De1), 5);
+            Bo = ROL32((Ami0^Di0), 7);
+            Bu = ROL32((Abo0^Do1), 28);
+            Ba = ROL32((Aku1^Du0), 13);
+            Asa0 =   Ba ^((~Be)&  Bi );
+            Age0 =   Be ^((~Bi)&  Bo );
+            Ami0 =   Bi ^((~Bo)&  Bu );
+            Abo0 =   Bo ^((~Bu)&  Ba );
+            Aku1 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Aka0^Da1), 21);
+            Bu = ROL32((Ase0^De0), 1);
+            Ba = ROL32((Agi1^Di0), 31);
+            Be = ROL32((Amo0^Do1), 28);
+            Bi = ROL32((Abu0^Du1), 20);
+            Aka0 =   Ba ^((~Be)&  Bi );
+            Ase0 =   Be ^((~Bi)&  Bo );
+            Agi1 =   Bi ^((~Bo)&  Bu );
+            Amo0 =   Bo ^((~Bu)&  Ba );
+            Abu0 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Aka1^Da0), 20);
+            Bu = ROL32((Ase1^De1), 1);
+            Ba = ROL32((Agi0^Di1), 31);
+            Be = ROL32((Amo1^Do0), 27);
+            Bi = ROL32((Abu1^Du0), 19);
+            Aka1 =   Ba ^((~Be)&  Bi );
+            Ase1 =   Be ^((~Bi)&  Bo );
+            Agi0 =   Bi ^((~Bo)&  Bu );
+            Amo1 =   Bo ^((~Bu)&  Ba );
+            Abu1 =   Bu ^((~Ba)&  Be );
+
+            KeccakAtoD_round3(); /* 4th of the 4 unrolled rounds */
+
+            Ba = (Aba0^Da0);
+            Be = ROL32((Abe0^De0), 22);
+            Bi = ROL32((Abi0^Di1), 22);
+            Bo = ROL32((Abo0^Do1), 11);
+            Bu = ROL32((Abu0^Du0), 7);
+            Aba0 =   Ba ^((~Be)&  Bi );
+            Aba0 ^= *(pRoundConstants++);
+            Abe0 =   Be ^((~Bi)&  Bo );
+            Abi0 =   Bi ^((~Bo)&  Bu );
+            Abo0 =   Bo ^((~Bu)&  Ba );
+            Abu0 =   Bu ^((~Ba)&  Be );
+
+            Ba = (Aba1^Da1);
+            Be = ROL32((Abe1^De1), 22);
+            Bi = ROL32((Abi1^Di0), 21);
+            Bo = ROL32((Abo1^Do0), 10);
+            Bu = ROL32((Abu1^Du1), 7);
+            Aba1 =   Ba ^((~Be)&  Bi );
+            Aba1 ^= *(pRoundConstants++);
+            Abe1 =   Be ^((~Bi)&  Bo );
+            Abi1 =   Bi ^((~Bo)&  Bu );
+            Abo1 =   Bo ^((~Bu)&  Ba );
+            Abu1 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Aga0^Da1), 2);
+            Bo = ROL32((Age0^De1), 23);
+            Bu = ROL32((Agi0^Di1), 31);
+            Ba = ROL32((Ago0^Do0), 14);
+            Be = ROL32((Agu0^Du0), 10);
+            Aga0 =   Ba ^((~Be)&  Bi );
+            Age0 =   Be ^((~Bi)&  Bo );
+            Agi0 =   Bi ^((~Bo)&  Bu );
+            Ago0 =   Bo ^((~Bu)&  Ba );
+            Agu0 =   Bu ^((~Ba)&  Be );
+
+            Bi = ROL32((Aga1^Da0), 1);
+            Bo = ROL32((Age1^De0), 22);
+            Bu = ROL32((Agi1^Di0), 30);
+            Ba = ROL32((Ago1^Do1), 14);
+            Be = ROL32((Agu1^Du1), 10);
+            Aga1 =   Ba ^((~Be)&  Bi );
+            Age1 =   Be ^((~Bi)&  Bo );
+            Agi1 =   Bi ^((~Bo)&  Bu );
+            Ago1 =   Bo ^((~Bu)&  Ba );
+            Agu1 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Aka0^Da0), 9);
+            Ba = ROL32((Ake0^De1), 1);
+            Be = ROL32((Aki0^Di0), 3);
+            Bi = ROL32((Ako0^Do1), 13);
+            Bo = ROL32((Aku0^Du0), 4);
+            Aka0 =   Ba ^((~Be)&  Bi );
+            Ake0 =   Be ^((~Bi)&  Bo );
+            Aki0 =   Bi ^((~Bo)&  Bu );
+            Ako0 =   Bo ^((~Bu)&  Ba );
+            Aku0 =   Bu ^((~Ba)&  Be );
+
+            Bu = ROL32((Aka1^Da1), 9);
+            Ba = (Ake1^De0);
+            Be = ROL32((Aki1^Di1), 3);
+            Bi = ROL32((Ako1^Do0), 12);
+            Bo = ROL32((Aku1^Du1), 4);
+            Aka1 =   Ba ^((~Be)&  Bi );
+            Ake1 =   Be ^((~Bi)&  Bo );
+            Aki1 =   Bi ^((~Bo)&  Bu );
+            Ako1 =   Bo ^((~Bu)&  Ba );
+            Aku1 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Ama0^Da0), 18);
+            Bi = ROL32((Ame0^De0), 5);
+            Bo = ROL32((Ami0^Di1), 8);
+            Bu = ROL32((Amo0^Do0), 28);
+            Ba = ROL32((Amu0^Du1), 14);
+            Ama0 =   Ba ^((~Be)&  Bi );
+            Ame0 =   Be ^((~Bi)&  Bo );
+            Ami0 =   Bi ^((~Bo)&  Bu );
+            Amo0 =   Bo ^((~Bu)&  Ba );
+            Amu0 =   Bu ^((~Ba)&  Be );
+
+            Be = ROL32((Ama1^Da1), 18);
+            Bi = ROL32((Ame1^De1), 5);
+            Bo = ROL32((Ami1^Di0), 7);
+            Bu = ROL32((Amo1^Do1), 28);
+            Ba = ROL32((Amu1^Du0), 13);
+            Ama1 =   Ba ^((~Be)&  Bi );
+            Ame1 =   Be ^((~Bi)&  Bo );
+            Ami1 =   Bi ^((~Bo)&  Bu );
+            Amo1 =   Bo ^((~Bu)&  Ba );
+            Amu1 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Asa0^Da1), 21);
+            Bu = ROL32((Ase0^De0), 1);
+            Ba = ROL32((Asi0^Di0), 31);
+            Be = ROL32((Aso0^Do1), 28);
+            Bi = ROL32((Asu0^Du1), 20);
+            Asa0 =   Ba ^((~Be)&  Bi );
+            Ase0 =   Be ^((~Bi)&  Bo );
+            Asi0 =   Bi ^((~Bo)&  Bu );
+            Aso0 =   Bo ^((~Bu)&  Ba );
+            Asu0 =   Bu ^((~Ba)&  Be );
+
+            Bo = ROL32((Asa1^Da0), 20);
+            Bu = ROL32((Ase1^De1), 1);
+            Ba = ROL32((Asi1^Di1), 31);
+            Be = ROL32((Aso1^Do0), 27);
+            Bi = ROL32((Asu1^Du0), 19);
+            Asa1 =   Ba ^((~Be)&  Bi );
+            Ase1 =   Be ^((~Bi)&  Bo );
+            Asi1 =   Bi ^((~Bo)&  Bu );
+            Aso1 =   Bo ^((~Bu)&  Ba );
+            Asu1 =   Bu ^((~Ba)&  Be );
+        }
+        while ( *pRoundConstants != 0xFF ); /* each pass consumes 8 constant words; the 0xFF terminator is only landed on when nRounds is a non-zero multiple of 4, otherwise the read runs past the table */
+
+        #undef Aba0
+        #undef Aba1
+        #undef Abe0
+        #undef Abe1
+        #undef Abi0
+        #undef Abi1
+        #undef Abo0
+        #undef Abo1
+        #undef Abu0
+        #undef Abu1
+        #undef Aga0
+        #undef Aga1
+        #undef Age0
+        #undef Age1
+        #undef Agi0
+        #undef Agi1
+        #undef Ago0
+        #undef Ago1
+        #undef Agu0
+        #undef Agu1
+        #undef Aka0
+        #undef Aka1
+        #undef Ake0
+        #undef Ake1
+        #undef Aki0
+        #undef Aki1
+        #undef Ako0
+        #undef Ako1
+        #undef Aku0
+        #undef Aku1
+        #undef Ama0
+        #undef Ama1
+        #undef Ame0
+        #undef Ame1
+        #undef Ami0
+        #undef Ami1
+        #undef Amo0
+        #undef Amo1
+        #undef Amu0
+        #undef Amu1
+        #undef Asa0
+        #undef Asa1
+        #undef Ase0
+        #undef Ase1
+        #undef Asi0
+        #undef Asi1
+        #undef Aso0
+        #undef Aso1
+        #undef Asu0
+        #undef Asu1
+    }
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Permute_12rounds(void *state)
+{   /* Convenience wrapper: KeccakP1600_Permute_Nrounds with nRounds fixed to 12. */
+     KeccakP1600_Permute_Nrounds(state, 12);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Permute_24rounds(void *state)
+{   /* Convenience wrapper: KeccakP1600_Permute_Nrounds with nRounds fixed to 24, i.e. every round constant in the table is used. */
+     KeccakP1600_Permute_Nrounds(state, 24);
+}
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
new file mode 100644
index 00000000000..9501c64b186
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64-config.h
@@ -0,0 +1,3 @@
+#define KeccakP1600_implementation_config "lane complementing, all rounds unrolled" /* human-readable summary of the two switches below */
+#define KeccakP1600_fullUnrolling /* makes KeccakP-1600-opt64.c define FullUnrolling instead of Unrolling */
+#define KeccakP1600_useLaneComplementing /* makes KeccakP-1600-opt64.c define UseBebigokimisa and keep lanes 1, 2, 8, 12, 17 and 20 bit-inverted */
diff --git a/Modules/_sha3/kcp/KeccakP-1600-opt64.c b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
new file mode 100644
index 00000000000..c90010dd925
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-opt64.c
@@ -0,0 +1,474 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include <stdlib.h>
+/* #include "brg_endian.h" */
+#include "KeccakP-1600-opt64-config.h"
+
+#if NOT_PYTHON
+typedef unsigned char UINT8;
+/* typedef unsigned long long int UINT64; */
+#endif
+
+#if defined(KeccakP1600_useLaneComplementing)
+#define UseBebigokimisa
+#endif
+
+#if defined(_MSC_VER) /* ROL64(a, offset): rotate the 64-bit value a left by offset bits; one variant per toolchain */
+#define ROL64(a, offset) _rotl64(a, offset)
+#elif defined(KeccakP1600_useSHLD)
+    #define ROL64(x,N) ({ \
+    register UINT64 __out; \
+    register UINT64 __in = (x); \
+    __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
+    __out; \
+    })
+#else /* portable fallback; offset must stay within 1..63 so that neither shift count reaches 64 */
+#define ROL64(a, offset) ((((UINT64)(a)) << (offset)) ^ (((UINT64)(a)) >> (64-(offset))))
+#endif
+
+#include "KeccakP-1600-64.macros"
+#ifdef KeccakP1600_fullUnrolling
+#define FullUnrolling
+#else
+#define Unrolling KeccakP1600_unrolling
+#endif
+#include "KeccakP-1600-unrolling.macros"
+#include "SnP-Relaned.h"
+
+static const UINT64 KeccakF1600RoundConstants[24] = { /* 24 entries -- presumably one 64-bit constant per round; the code that reads the table is outside this excerpt */
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Initialize(void *state)
+{   /* Reset the 200-byte state to the all-zero value in its internal representation. */
+#ifdef KeccakP1600_useLaneComplementing
+    static const unsigned int complemented[6] = { 1, 2, 8, 12, 17, 20 }; /* lanes that are stored bit-inverted */
+    unsigned int k;
+#endif
+    memset(state, 0, 200);
+#ifdef KeccakP1600_useLaneComplementing
+    for(k=0; k<6; k++)
+        ((UINT64*)state)[complemented[k]] = ~(UINT64)0; /* an inverted zero lane is all ones */
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    /* XOR `length` bytes of `data` into lane `lanePosition`, starting at byte `offset` within that lane. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    UINT64 chunk = 0;
+    if (length == 0)
+        return;                          /* nothing to add */
+    if (length == 1)
+        chunk = data[0];                 /* single byte: no memcpy call needed */
+    else
+        memcpy(&chunk, data, length);    /* remaining high bytes of chunk stay zero */
+    chunk <<= offset*8;
+#else
+    UINT64 chunk = 0;
+    unsigned int n;
+    for(n=0; n<length; n++) {
+        chunk |= ((UINT64)data[n]) << ((n+offset)*8);   /* byte n lands (n+offset) bytes above the low end */
+    }
+#endif
+    ((UINT64*)state)[lanePosition] ^= chunk;
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{   /* XOR the first laneCount 8-byte lanes of `data` into the start of the state; `data` is only ever read. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    unsigned int i = 0;
+#ifdef NO_MISALIGNED_ACCESSES
+    /* If either pointer is misaligned, fall back to byte-wise xor. */
+
+    if (((((uintptr_t)state) & 7) != 0) || ((((uintptr_t)data) & 7) != 0)) {
+      for (i = 0; i < laneCount * 8; i++) {
+        ((unsigned char*)state)[i] ^= data[i];
+      }
+    }
+    else
+#endif
+    {
+      /* Otherwise XOR whole 64-bit words, in runs of 8, 4, 2 and finally 1 lane. */
+
+      for( ; (i+8)<=laneCount; i+=8) {
+          ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+          ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+          ((UINT64*)state)[i+2] ^= ((const UINT64*)data)[i+2];
+          ((UINT64*)state)[i+3] ^= ((const UINT64*)data)[i+3];
+          ((UINT64*)state)[i+4] ^= ((const UINT64*)data)[i+4];
+          ((UINT64*)state)[i+5] ^= ((const UINT64*)data)[i+5];
+          ((UINT64*)state)[i+6] ^= ((const UINT64*)data)[i+6];
+          ((UINT64*)state)[i+7] ^= ((const UINT64*)data)[i+7];
+      }
+      for( ; (i+4)<=laneCount; i+=4) {
+          ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+          ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+          ((UINT64*)state)[i+2] ^= ((const UINT64*)data)[i+2];
+          ((UINT64*)state)[i+3] ^= ((const UINT64*)data)[i+3];
+      }
+      for( ; (i+2)<=laneCount; i+=2) {
+          ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+          ((UINT64*)state)[i+1] ^= ((const UINT64*)data)[i+1];
+      }
+      if (i<laneCount) {
+          ((UINT64*)state)[i+0] ^= ((const UINT64*)data)[i+0];
+      }
+    }
+#else
+    unsigned int i;
+    const UINT8 *curData = data;   /* const-qualified: initializing a plain UINT8* from `data` would discard the qualifier */
+    for(i=0; i<laneCount; i++, curData+=8) {
+        UINT64 lane = (UINT64)curData[0]   /* assemble each lane from its bytes, least-significant byte first */
+            | ((UINT64)curData[1] << 8)
+            | ((UINT64)curData[2] << 16)
+            | ((UINT64)curData[3] << 24)
+            | ((UINT64)curData[4] << 32)
+            | ((UINT64)curData[5] << 40)
+            | ((UINT64)curData[6] << 48)
+            | ((UINT64)curData[7] << 56);
+        ((UINT64*)state)[i] ^= lane;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    /* XOR one byte into the state at byte position `offset`: lane offset/8, byte offset%8 within that lane. */
+    const unsigned int shift = (offset%8)*8;
+    ((UINT64*)state)[offset/8] ^= ((UINT64)byte) << shift;
+}
+#endif
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{   /* Byte-range entry point: hands the work to SnP_AddBytes together with the whole-lane and in-lane helpers. */
+    SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); /* 8: presumably the lane size in bytes -- confirm against the SnP_AddBytes definition */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    /* Store `length` bytes of `data` at byte `offset` of lane `lanePosition`, replacing what was there. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    unsigned char *dst = (unsigned char*)state + lanePosition*8 + offset;
+#ifdef KeccakP1600_useLaneComplementing
+    switch (lanePosition) {
+    case 1: case 2: case 8: case 12: case 17: case 20:
+        for( ; length != 0; length--)    /* complemented lane: store the bytes inverted */
+            *dst++ = ~*data++;
+        return;
+    }
+#endif
+    memcpy(dst, data, length);
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{ /* Overwrite the first `laneCount` lanes (8 bytes each) of the state with the contents of `data`. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+    unsigned int lanePosition;
+
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++)
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) /* lanes stored inverted under lane complementing */
+            ((UINT64*)state)[lanePosition] = ~((const UINT64*)data)[lanePosition]; /* NOTE(review): reads `data` through a UINT64 pointer - presumably callers supply suitably aligned input; confirm */
+        else
+            ((UINT64*)state)[lanePosition] = ((const UINT64*)data)[lanePosition];
+#else /* no lane complementing: the state is a plain byte image of the lanes */
+    memcpy(state, data, laneCount*8);
+#endif
+#else /* non-little-endian builds: no implementation provided, the build fails here */
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{ /* Overwrite `length` state bytes starting at byte `offset` with `data`, via the SnP_OverwriteBytes helper (defined outside this chunk). */
+    SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8); /* passes the whole-lane and in-lane routines; 8 is presumably the lane size in bytes */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{ /* Make the first `byteCount` bytes of the state represent zero. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+    unsigned int lanePosition;
+
+    for(lanePosition=0; lanePosition<byteCount/8; lanePosition++) /* whole lanes first */
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) /* lanes stored inverted: logical zero is written as all-ones */
+            ((UINT64*)state)[lanePosition] = ~0; /* ~0 is the int -1; converted to the lane type it sets every bit (assuming UINT64 is an unsigned type - defined outside this chunk) */
+        else
+            ((UINT64*)state)[lanePosition] = 0;
+    if (byteCount%8 != 0) { /* trailing partial lane: only its first byteCount%8 bytes are cleared */
+        lanePosition = byteCount/8;
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            memset((unsigned char*)state+lanePosition*8, 0xFF, byteCount%8); /* inverted lane: zero bytes are stored as 0xFF */
+        else
+            memset((unsigned char*)state+lanePosition*8, 0, byteCount%8);
+    }
+#else /* no lane complementing: zero is stored as zero everywhere */
+    memset(state, 0, byteCount);
+#endif
+#else /* non-little-endian builds: no implementation provided, the build fails here */
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Permute_24rounds(void *state)
+{ /* Apply the 24-round permutation to `state` in place: load the lanes into locals, run the `rounds24` macro, store them back. */
+    declareABCDE /* macro defined outside this chunk; presumably declares the A.. and E.. working variables used by the round macros */
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter used by the non-fully-unrolled variants of rounds24 */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+
+    copyFromState(A, stateAsLanes) /* macro defined outside this chunk */
+    rounds24 /* expands to the round sequence selected in KeccakP-1600-unrolling.macros */
+    copyToState(stateAsLanes, A) /* macro defined outside this chunk */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_Permute_12rounds(void *state)
+{ /* Apply the 12-round permutation to `state` in place: load the lanes into locals, run the `rounds12` macro, store them back. */
+    declareABCDE /* macro defined outside this chunk; presumably declares the A.. and E.. working variables used by the round macros */
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter used by the non-fully-unrolled variants of rounds12 */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+
+    copyFromState(A, stateAsLanes) /* macro defined outside this chunk */
+    rounds12 /* every rounds12 variant in KeccakP-1600-unrolling.macros uses round indices 12..23 */
+    copyToState(stateAsLanes, A) /* macro defined outside this chunk */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{ /* Copy `length` bytes of lane `lanePosition`, starting at byte `offset` within the lane, into `data`. */
+    UINT64 lane = ((UINT64*)state)[lanePosition];
+#ifdef KeccakP1600_useLaneComplementing
+    if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) /* lanes stored inverted under lane complementing */
+        lane = ~lane; /* undo the inversion before handing bytes to the caller */
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    {
+        UINT64 lane1[1]; /* addressable copy of the (possibly re-inverted) lane */
+        lane1[0] = lane;
+        memcpy(data, (UINT8*)lane1+offset, length); /* little-endian: in-memory byte order already matches the output order */
+    }
+#else /* other byte orders: emit bytes from least significant upward using shifts */
+    unsigned int i;
+    lane >>= offset*8; /* drop the `offset` least significant bytes */
+    for(i=0; i<length; i++) {
+        data[i] = lane & 0xFF;
+        lane >>= 8;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+{ /* Write `word` into bytes[0..7], least significant byte first; only compiled when PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN. */
+    unsigned int i;
+
+    for(i=0; i<(64/8); i++) /* 64/8 = 8 bytes per word */
+        bytes[i] = (word >> (8*i)) & 0xFF;
+}
+#endif
+
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{ /* Copy the first `laneCount` lanes (8 bytes each) of the state into `data`, undoing lane complementing when it is enabled. */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, laneCount*8);
+#else /* other byte orders: serialise each lane least significant byte first */
+    unsigned int i;
+
+    for(i=0; i<laneCount; i++)
+        fromWordToBytes(data+(i*8), ((const UINT64*)state)[i]);
+#endif
+#ifdef KeccakP1600_useLaneComplementing
+    if (laneCount > 1) { /* the tests are nested because the inverted lane indices 1, 2, 8, 12, 17, 20 are increasing: once one fails, all later ones fail too */
+        ((UINT64*)data)[ 1] = ~((UINT64*)data)[ 1]; /* NOTE(review): accesses `data` through a UINT64 pointer - presumably the output buffer is suitably aligned; confirm */
+        if (laneCount > 2) {
+            ((UINT64*)data)[ 2] = ~((UINT64*)data)[ 2];
+            if (laneCount > 8) {
+                ((UINT64*)data)[ 8] = ~((UINT64*)data)[ 8];
+                if (laneCount > 12) {
+                    ((UINT64*)data)[12] = ~((UINT64*)data)[12];
+                    if (laneCount > 17) {
+                        ((UINT64*)data)[17] = ~((UINT64*)data)[17];
+                        if (laneCount > 20) {
+                            ((UINT64*)data)[20] = ~((UINT64*)data)[20];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{ /* Copy `length` state bytes starting at byte `offset` into `data`, via the SnP_ExtractBytes helper (defined outside this chunk). */
+    SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); /* passes the whole-lane and in-lane routines; 8 is presumably the lane size in bytes */
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{ /* For `length` bytes of lane `lanePosition` starting at byte `offset`: output[i] = input[i] XOR the corresponding state byte. */
+    UINT64 lane = ((UINT64*)state)[lanePosition];
+#ifdef KeccakP1600_useLaneComplementing
+    if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) /* lanes stored inverted under lane complementing */
+        lane = ~lane; /* recover the logical lane value before XORing */
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    {
+        unsigned int i;
+        UINT64 lane1[1]; /* addressable copy so the lane can be read byte by byte */
+        lane1[0] = lane;
+        for(i=0; i<length; i++)
+            output[i] = input[i] ^ ((UINT8*)lane1)[offset+i];
+    }
+#else /* other byte orders: take bytes from least significant upward using shifts */
+    unsigned int i;
+    lane >>= offset*8; /* drop the `offset` least significant bytes */
+    for(i=0; i<length; i++) {
+        output[i] = input[i] ^ (lane & 0xFF);
+        lane >>= 8;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{ /* For the first `laneCount` lanes (8 bytes each): output = input XOR state, with lane complementing undone when enabled. */
+    unsigned int i;
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+    unsigned char temp[8]; /* one lane serialised least significant byte first */
+    unsigned int j;
+#endif
+
+    for(i=0; i<laneCount; i++) {
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+        ((UINT64*)output)[i] = ((UINT64*)input)[i] ^ ((const UINT64*)state)[i]; /* NOTE(review): word-wise access to input/output - presumably both buffers are suitably aligned; confirm */
+#else
+        fromWordToBytes(temp, ((const UINT64*)state)[i]);
+        for(j=0; j<8; j++)
+            output[i*8+j] = input[i*8+j] ^ temp[j];
+#endif
+    }
+#ifdef KeccakP1600_useLaneComplementing
+    if (laneCount > 1) { /* inverting the XOR result equals XORing with the un-inverted lane, since ~(a ^ b) == a ^ ~b; nesting works because the indices 1, 2, 8, 12, 17, 20 are increasing */
+        ((UINT64*)output)[ 1] = ~((UINT64*)output)[ 1];
+        if (laneCount > 2) {
+            ((UINT64*)output)[ 2] = ~((UINT64*)output)[ 2];
+            if (laneCount > 8) {
+                ((UINT64*)output)[ 8] = ~((UINT64*)output)[ 8];
+                if (laneCount > 12) {
+                    ((UINT64*)output)[12] = ~((UINT64*)output)[12];
+                    if (laneCount > 17) {
+                        ((UINT64*)output)[17] = ~((UINT64*)output)[17];
+                        if (laneCount > 20) {
+                            ((UINT64*)output)[20] = ~((UINT64*)output)[20];
+                        }
+                    }
+                }
+            }
+        }
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{ /* XOR `length` state bytes starting at byte `offset` with `input` into `output`, via the SnP_ExtractAndAddBytes helper (defined outside this chunk). */
+    SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8); /* passes the whole-lane and in-lane routines; 8 is presumably the lane size in bytes */
+}
+
+/* ---------------------------------------------------------------- */
+
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
+{ /* Absorb as many complete blocks of laneCount*8 bytes from `data` as fit in `dataByteLen`, permuting after each; returns the number of bytes consumed. */
+    size_t originalDataByteLen = dataByteLen;
+    declareABCDE /* macro defined outside this chunk; presumably declares the working variables used by the round macros */
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter used by the non-fully-unrolled variants of rounds24 */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+    UINT64 *inDataAsLanes = (UINT64*)data; /* NOTE(review): input is read as 64-bit words - alignment and byte-order handling depend on addInput (not visible here); confirm */
+
+    copyFromState(A, stateAsLanes) /* the state is loaded once and kept in locals across all blocks */
+    while(dataByteLen >= laneCount*8) { /* a trailing partial block is left for the caller */
+        addInput(A, inDataAsLanes, laneCount) /* macro defined outside this chunk */
+        rounds24
+        inDataAsLanes += laneCount;
+        dataByteLen -= laneCount*8;
+    }
+    copyToState(stateAsLanes, A)
+    return originalDataByteLen - dataByteLen;
+}
diff --git a/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
new file mode 100644
index 00000000000..405ce29724c
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakP-1600-unrolling.macros
@@ -0,0 +1,185 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#if (defined(FullUnrolling)) /* Fully unrolled variant: every round is written out, alternating A->E and E->A, and the last round (index 23) uses thetaRhoPiChiIota without PrepareTheta. */
+#define rounds24 /* round indices 0..23 */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#define rounds12 /* round indices 12..23 only */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 12) /* Twelve rounds per loop iteration: rounds24 loops twice (i = 0, 12); rounds12 is written out without a loop. */
+#define rounds24 /* needs an `unsigned int i` in the expanding scope */ \
+    prepareTheta \
+    for(i=0; i<24; i+=12) { \
+        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+    } \
+
+#define rounds12 /* round indices 12..23; the last round uses thetaRhoPiChiIota without PrepareTheta */ \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 6) /* Six rounds per loop iteration; an even count, so each iteration ends with the E->A step. */
+#define rounds24 /* i = 0, 6, 12, 18 */ \
+    prepareTheta \
+    for(i=0; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+
+#define rounds12 /* i = 12, 18: round indices 12..23 */ \
+    prepareTheta \
+    for(i=12; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+
+#elif (Unrolling == 4) /* Four rounds per loop iteration; an even count, so each iteration ends with the E->A step. */
+#define rounds24 /* i = 0, 4, ..., 20 */ \
+    prepareTheta \
+    for(i=0; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+
+#define rounds12 /* i = 12, 16, 20: round indices 12..23 */ \
+    prepareTheta \
+    for(i=12; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+
+#elif (Unrolling == 3) /* Three rounds per loop iteration; an odd count ends on an A->E step, so copyStateVariables(A, E) follows (presumably copying E back into A - macro defined outside this chunk). */
+#define rounds24 /* i = 0, 3, ..., 21 */ \
+    prepareTheta \
+    for(i=0; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#define rounds12 /* i = 12, 15, 18, 21: round indices 12..23 */ \
+    prepareTheta \
+    for(i=12; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#elif (Unrolling == 2) /* Two rounds per loop iteration: one A->E step followed by one E->A step. */
+#define rounds24 /* i = 0, 2, ..., 22 */ \
+    prepareTheta \
+    for(i=0; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#define rounds12 /* i = 12, 14, ..., 22: round indices 12..23 */ \
+    prepareTheta \
+    for(i=12; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#elif (Unrolling == 1) /* One round per loop iteration: each A->E step is followed by copyStateVariables(A, E) (presumably copying E back into A - macro defined outside this chunk). */
+#define rounds24 /* i = 0..23 */ \
+    prepareTheta \
+    for(i=0; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#define rounds12 /* i = 12..23 */ \
+    prepareTheta \
+    for(i=12; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#else /* neither FullUnrolling nor a supported Unrolling value (12, 6, 4, 3, 2, 1) was configured */
+#error "Unrolling is not correctly specified!"
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.c b/Modules/_sha3/kcp/KeccakSponge.c
new file mode 100644
index 00000000000..afdb73172f3
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.c
@@ -0,0 +1,92 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "KeccakSponge.h"
+
+#ifdef KeccakReference
+    #include "displayIntermediateValues.h"
+#endif
+
+#ifndef KeccakP200_excluded /* Instantiate the generic sponge code for the 200-bit permutation unless it is excluded from the build. */
+    #include "KeccakP-200-SnP.h"
+
+    #define prefix KeccakWidth200 /* KeccakSponge.inc pastes this in front of _Sponge, _SpongeAbsorb, ... to form the public names */
+    #define SnP KeccakP200 /* KeccakSponge.inc pastes this in front of _AddBytes, _ExtractBytes, ... to pick the state routines */
+    #define SnP_width 200 /* permutation width; Sponge() compares rate+capacity against it */
+    #define SnP_Permute KeccakP200_Permute_18rounds
+    #if defined(KeccakF200_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF200_FastLoop_Absorb /* optional: only defined when the permutation header advertises a fast absorb loop */
+    #endif
+        #include "KeccakSponge.inc" /* expands the sponge functions using the macros above */
+    #undef prefix /* the undefs below clear the parameters so the next instantiation can redefine them */
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP400_excluded /* Instantiate the generic sponge code for the 400-bit permutation unless it is excluded from the build. */
+    #include "KeccakP-400-SnP.h"
+
+    #define prefix KeccakWidth400 /* KeccakSponge.inc pastes this in front of _Sponge, _SpongeAbsorb, ... to form the public names */
+    #define SnP KeccakP400 /* KeccakSponge.inc pastes this in front of _AddBytes, _ExtractBytes, ... to pick the state routines */
+    #define SnP_width 400 /* permutation width; Sponge() compares rate+capacity against it */
+    #define SnP_Permute KeccakP400_Permute_20rounds
+    #if defined(KeccakF400_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF400_FastLoop_Absorb /* optional: only defined when the permutation header advertises a fast absorb loop */
+    #endif
+        #include "KeccakSponge.inc" /* expands the sponge functions using the macros above */
+    #undef prefix /* the undefs below clear the parameters so the next instantiation can redefine them */
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP800_excluded /* Instantiate the generic sponge code for the 800-bit permutation unless it is excluded from the build. */
+    #include "KeccakP-800-SnP.h"
+
+    #define prefix KeccakWidth800 /* KeccakSponge.inc pastes this in front of _Sponge, _SpongeAbsorb, ... to form the public names */
+    #define SnP KeccakP800 /* KeccakSponge.inc pastes this in front of _AddBytes, _ExtractBytes, ... to pick the state routines */
+    #define SnP_width 800 /* permutation width; Sponge() compares rate+capacity against it */
+    #define SnP_Permute KeccakP800_Permute_22rounds
+    #if defined(KeccakF800_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF800_FastLoop_Absorb /* optional: only defined when the permutation header advertises a fast absorb loop */
+    #endif
+        #include "KeccakSponge.inc" /* expands the sponge functions using the macros above */
+    #undef prefix /* the undefs below clear the parameters so the next instantiation can redefine them */
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+#ifndef KeccakP1600_excluded /* Instantiate the generic sponge code for the 1600-bit permutation unless it is excluded from the build. */
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600 /* KeccakSponge.inc pastes this in front of _Sponge, _SpongeAbsorb, ... to form the public names */
+    #define SnP KeccakP1600 /* KeccakSponge.inc pastes this in front of _AddBytes, _ExtractBytes, ... to pick the state routines */
+    #define SnP_width 1600 /* permutation width; Sponge() compares rate+capacity against it */
+    #define SnP_Permute KeccakP1600_Permute_24rounds
+    #if defined(KeccakF1600_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb /* optional: only defined when the permutation header advertises a fast absorb loop */
+    #endif
+        #include "KeccakSponge.inc" /* expands the sponge functions using the macros above */
+    #undef prefix /* the undefs below clear the parameters so the next instantiation can redefine them */
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.h b/Modules/_sha3/kcp/KeccakSponge.h
new file mode 100644
index 00000000000..0f4badcac05
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.h
@@ -0,0 +1,172 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+/** General information
+  *
+  * The following type and functions are not actually implemented. Their
+  * documentation is generic, with the prefix Prefix replaced by
+  * - KeccakWidth200 for a sponge function based on Keccak-f[200]
+  * - KeccakWidth400 for a sponge function based on Keccak-f[400]
+  * - KeccakWidth800 for a sponge function based on Keccak-f[800]
+  * - KeccakWidth1600 for a sponge function based on Keccak-f[1600]
+  *
+  * In all these functions, the rate and capacity must sum to the width of the
+  * chosen permutation. For instance, to use the sponge function
+  * Keccak[r=1344, c=256], one must use KeccakWidth1600_Sponge() or a combination
+  * of KeccakWidth1600_SpongeInitialize(), KeccakWidth1600_SpongeAbsorb(),
+  * KeccakWidth1600_SpongeAbsorbLastFewBits() and
+  * KeccakWidth1600_SpongeSqueeze().
+  *
+  * The Prefix_SpongeInstance contains the sponge instance attributes for use
+  * with the Prefix_Sponge* functions.
+  * It gathers the state processed by the permutation as well as the rate,
+  * the position of input/output bytes in the state and the phase
+  * (absorbing or squeezing).
+  */
+
+#ifdef DontReallyInclude_DocumentationOnly
+/** Function to evaluate the sponge function Keccak[r, c] in a single call.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  input           Pointer to the input message (before the suffix).
+  * @param  inputByteLen    The length of the input message in bytes.
+  * @param  suffix          Byte containing from 0 to 7 suffix bits
+  *                         that must be absorbed after @a input.
+  *                         These <i>n</i> bits must be in the least significant bit positions.
+  *                         These bits must be delimited with a bit 1 at position <i>n</i>
+  *                         (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                         from position <i>n</i>+1 to position 7.
+  *                         Some examples:
+  *                             - If no bits are to be absorbed, then @a suffix must be 0x01.
+  *                             - If the 2-bit sequence 0,0 is to be absorbed, @a suffix must be 0x04.
+  *                             - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a suffix must be 0x32.
+  *                             - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a suffix must be 0x8B.
+  *                         .
+  * @param  output          Pointer to the output buffer.
+  * @param  outputByteLen   The desired number of output bytes.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @pre    @a suffix ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen);
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The phase of the sponge function is set to absorbing.
+  * @param  spongeInstance  Pointer to the sponge instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeInitialize(Prefix_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity);
+
+/**
+  * Function to give input data bytes for the sponge function to absorb.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the input data.
+  * @param  dataByteLen  The number of input bytes provided in the input data.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorb(Prefix_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen);
+
+/**
+  * Function to give input data bits for the sponge function to absorb
+  * and then to switch to the squeezing phase.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  delimitedData   Byte containing from 0 to 7 trailing bits
+  *                     that must be absorbed.
+  *                     These <i>n</i> bits must be in the least significant bit positions.
+  *                     These bits must be delimited with a bit 1 at position <i>n</i>
+  *                     (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                     from position <i>n</i>+1 to position 7.
+  *                     Some examples:
+  *                         - If no bits are to be absorbed, then @a delimitedData must be 0x01.
+  *                         - If the 2-bit sequence 0,0 is to be absorbed, @a delimitedData must be 0x04.
+  *                         - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a delimitedData must be 0x32.
+  *                         - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedData must be 0x8B.
+  *                     .
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @pre    @a delimitedData ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorbLastFewBits(Prefix_SpongeInstance *spongeInstance, unsigned char delimitedData);
+
+/**
+  * Function to squeeze output data from the sponge function.
+  * If the sponge function was in the absorbing phase, this function
+  * switches it to the squeezing phase
+  * as if Prefix_SpongeAbsorbLastFewBits(spongeInstance, 0x01) was called.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  dataByteLen The number of output bytes desired.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+#endif
+
+#include <string.h>
+#include "align.h"
+
+#define KCP_DeclareSpongeStructure(prefix, size, alignment) /* declares struct prefix_SpongeInstanceStruct and its typedef prefix_SpongeInstance */ \
+    ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
+        unsigned char state[size]; /* state processed by the permutation, `size` bytes */ \
+        unsigned int rate; /* rate of the sponge */ \
+        unsigned int byteIOIndex; /* position of input/output bytes in the state */ \
+        int squeezing; /* phase: absorbing or squeezing */ \
+    } prefix##_SpongeInstance;
+
+#define KCP_DeclareSpongeFunctions(prefix) /* emits prototypes for the five sponge functions of one width; their contracts are given by the Prefix_* documentation in this header */ \
+    int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \
+    int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \
+    int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \
+    int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \
+    int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+
+#ifndef KeccakP200_excluded /* declare the KeccakWidth200 instance type and functions unless this width is excluded from the build */
+    #include "KeccakP-200-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment) /* size and alignment constants are not defined in this file; presumably they come from the SnP header included above */
+    KCP_DeclareSpongeFunctions(KeccakWidth200)
+#endif
+
+#ifndef KeccakP400_excluded /* declare the KeccakWidth400 instance type and functions unless this width is excluded from the build */
+    #include "KeccakP-400-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment) /* size and alignment constants are not defined in this file; presumably they come from the SnP header included above */
+    KCP_DeclareSpongeFunctions(KeccakWidth400)
+#endif
+
+#ifndef KeccakP800_excluded /* declare the KeccakWidth800 instance type and functions unless this width is excluded from the build */
+    #include "KeccakP-800-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment) /* size and alignment constants are not defined in this file; presumably they come from the SnP header included above */
+    KCP_DeclareSpongeFunctions(KeccakWidth800)
+#endif
+
+#ifndef KeccakP1600_excluded /* declare the KeccakWidth1600 instance type and functions unless this width is excluded from the build */
+    #include "KeccakP-1600-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment) /* size and alignment constants are not defined in this file; presumably they come from the SnP header included above */
+    KCP_DeclareSpongeFunctions(KeccakWidth1600)
+#endif
+
+#endif
diff --git a/Modules/_sha3/kcp/KeccakSponge.inc b/Modules/_sha3/kcp/KeccakSponge.inc
new file mode 100644
index 00000000000..e10739deafa
--- /dev/null
+++ b/Modules/_sha3/kcp/KeccakSponge.inc
@@ -0,0 +1,332 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#define JOIN0(a, b)                     a ## b
+#define JOIN(a, b)                      JOIN0(a, b)
+/* Names of the functions defined below, built from 'prefix' (not defined in this file; it must be set before inclusion). */
+#define Sponge                          JOIN(prefix, _Sponge)
+#define SpongeInstance                  JOIN(prefix, _SpongeInstance)
+#define SpongeInitialize                JOIN(prefix, _SpongeInitialize)
+#define SpongeAbsorb                    JOIN(prefix, _SpongeAbsorb)
+#define SpongeAbsorbLastFewBits         JOIN(prefix, _SpongeAbsorbLastFewBits)
+#define SpongeSqueeze                   JOIN(prefix, _SpongeSqueeze)
+/* Names of the permutation-layer primitives used below, built from 'SnP' (likewise expected to be set before inclusion). */
+#define SnP_stateSizeInBytes            JOIN(SnP, _stateSizeInBytes)
+#define SnP_stateAlignment              JOIN(SnP, _stateAlignment)
+#define SnP_StaticInitialize            JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize                  JOIN(SnP, _Initialize)
+#define SnP_AddByte                     JOIN(SnP, _AddByte)
+#define SnP_AddBytes                    JOIN(SnP, _AddBytes)
+#define SnP_ExtractBytes                JOIN(SnP, _ExtractBytes)
+/* One-shot sponge: absorb input, then the delimited suffix byte and padding, then squeeze outputByteLen bytes into output. Returns 0 on success, 1 on invalid rate/capacity or a zero suffix. */
+int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen)
+{
+    ALIGN(SnP_stateAlignment) unsigned char state[SnP_stateSizeInBytes];
+    unsigned int partialBlock;
+    const unsigned char *curInput = input;
+    unsigned char *curOutput = output;
+    unsigned int rateInBytes = rate/8;
+    /* rate and capacity are bit counts: they must add up to SnP_width and rate must be a non-zero multiple of 8. */
+    if (rate+capacity != SnP_width)
+        return 1;
+    if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0))
+        return 1;
+    if (suffix == 0)
+        return 1;
+
+    /* Initialize the state */
+
+    SnP_StaticInitialize();
+    SnP_Initialize(state);
+
+    /* First, absorb whole blocks */
+
+#ifdef SnP_FastLoop_Absorb
+    if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) {
+        /* fast lane: whole lane rate */
+        /* j is the byte count reported back by SnP_FastLoop_Absorb; the input cursor advances by that much. */
+        size_t j;
+        j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen);
+        curInput += j;
+        inputByteLen -= j;
+    }
+#endif
+    while(inputByteLen >= (size_t)rateInBytes) {
+        #ifdef KeccakReference
+        displayBytes(1, "Block to be absorbed", curInput, rateInBytes);
+        #endif
+        SnP_AddBytes(state, curInput, 0, rateInBytes);
+        SnP_Permute(state);
+        curInput += rateInBytes;
+        inputByteLen -= rateInBytes;
+    }
+
+    /* Then, absorb what remains */
+    /* The loop above left inputByteLen < rateInBytes, so narrowing it to unsigned int is lossless. */
+    partialBlock = (unsigned int)inputByteLen;
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock);
+    #endif
+    SnP_AddBytes(state, curInput, 0, partialBlock);
+
+    /* Finally, absorb the suffix */
+
+    #ifdef KeccakReference
+    {
+        unsigned char delimitedData1[1];
+        delimitedData1[0] = suffix;
+        displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+    }
+    #endif
+    /* Last few bits, whose delimiter coincides with first bit of padding */
+
+    SnP_AddByte(state, suffix, partialBlock);
+    /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+    /* (suffix >= 0x80, i.e. bit 7 of the suffix byte is set, and that byte is the last one of the block) */
+    if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1)))
+        SnP_Permute(state);
+    /* Second bit of padding */
+
+    SnP_AddByte(state, 0x80, rateInBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, SnP_width/8);
+        block[rateInBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, rateInBytes);
+    }
+    #endif
+    SnP_Permute(state);
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+
+    /* First, output whole blocks */
+    /* Strict '>' so that the last block, even a full one, is extracted below without an extra permutation. */
+    while(outputByteLen > (size_t)rateInBytes) {
+        SnP_ExtractBytes(state, curOutput, 0, rateInBytes);
+        SnP_Permute(state);
+        #ifdef KeccakReference
+        displayBytes(1, "Squeezed block", curOutput, rateInBytes);
+        #endif
+        curOutput += rateInBytes;
+        outputByteLen -= rateInBytes;
+    }
+
+    /* Finally, output what remains */
+
+    partialBlock = (unsigned int)outputByteLen;
+    SnP_ExtractBytes(state, curOutput, 0, partialBlock);
+    #ifdef KeccakReference
+    displayBytes(1, "Squeezed block (part)", curOutput, partialBlock);
+    #endif
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+/* Prepare instance for absorbing: validate rate/capacity (in bits), reset the permutation state, the I/O index and the squeezing flag. Returns 0 on success, 1 on invalid parameters. */
+int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity)
+{
+    if (rate+capacity != SnP_width)
+        return 1;
+    if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0))
+        return 1;
+    SnP_StaticInitialize();
+    SnP_Initialize(instance->state);
+    instance->rate = rate;
+    instance->byteIOIndex = 0;
+    instance->squeezing = 0;
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* Absorb dataByteLen bytes of data into instance. Returns 0 on success, or 1 when the instance has already switched to squeezing. */
+int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    const unsigned char *curData;
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    /* Remaining-length tests below compare (dataByteLen - i) in size_t, never (i + rateInBytes) or a narrowed value, so inputs >= 4 GiB cannot wrap. */
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) {
+#ifdef SnP_FastLoop_Absorb
+            /* processing full blocks first */
+
+            if ((rateInBytes % (SnP_width/200)) == 0) {
+                /* fast lane: whole lane rate */
+
+                j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i);
+                i += j;
+                curData += j;
+            }
+            else {
+#endif
+                for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, rateInBytes);
+                    #endif
+                    SnP_AddBytes(instance->state, curData, 0, rateInBytes);
+                    SnP_Permute(instance->state);
+                    curData+=rateInBytes;
+                }
+                i = dataByteLen - j;
+#ifdef SnP_FastLoop_Absorb
+            }
+#endif
+        }
+        else {
+            /* normal lane: using the message queue */
+            /* Clamp to the room left in the block BEFORE narrowing: casting first truncated lengths >= 2^32 and let the bound check wrap (CVE-2022-37454). */
+            partialBlock = rateInBytes-instance->byteIOIndex;
+            if (dataByteLen-i < partialBlock)
+                partialBlock = (unsigned int)(dataByteLen - i);
+            #ifdef KeccakReference
+            displayBytes(1, "Block to be absorbed (part)", curData, partialBlock);
+            #endif
+            i += partialBlock;
+
+            SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* Absorb the final delimited byte plus padding and switch instance to squeezing. Returns 0 on success, 1 if delimitedData is 0 or the instance is already squeezing. */
+int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData)
+{
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (delimitedData == 0)
+        return 1;
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+
+    #ifdef KeccakReference
+    {
+        unsigned char delimitedData1[1];
+        delimitedData1[0] = delimitedData;
+        displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+    }
+    #endif
+    /* Last few bits, whose delimiter coincides with first bit of padding */
+
+    SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex);
+    /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+    /* (delimitedData >= 0x80, i.e. bit 7 of the byte is set, and that byte is the last one of the block) */
+    if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1)))
+        SnP_Permute(instance->state);
+    /* Second bit of padding */
+
+    SnP_AddByte(instance->state, 0x80, rateInBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, SnP_width/8);
+        block[rateInBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, rateInBytes);
+    }
+    #endif
+    SnP_Permute(instance->state);
+    instance->byteIOIndex = 0;
+    instance->squeezing = 1;
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* Write dataByteLen output bytes to data; if instance is still absorbing, the input is first finalized with delimiter 0x01. Always returns 0. */
+int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    unsigned int rateInBytes = instance->rate/8;
+    unsigned char *curData;
+
+    if (!instance->squeezing)
+        SpongeAbsorbLastFewBits(instance, 0x01);
+    /* Remaining-length tests below compare (dataByteLen - i) in size_t, never (i + rateInBytes) or a narrowed value, so requests >= 4 GiB cannot wrap. */
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) {
+            for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                SnP_Permute(instance->state);
+                SnP_ExtractBytes(instance->state, curData, 0, rateInBytes);
+                #ifdef KeccakReference
+                displayBytes(1, "Squeezed block", curData, rateInBytes);
+                #endif
+                curData+=rateInBytes;
+            }
+            i = dataByteLen - j;
+        }
+        else {
+            /* normal lane: using the message queue */
+            /* Clamp to what is left in the block BEFORE narrowing: casting first truncated lengths >= 2^32 and let the bound check wrap (CVE-2022-37454). */
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+            partialBlock = rateInBytes-instance->byteIOIndex;
+            if (dataByteLen-i < partialBlock)
+                partialBlock = (unsigned int)(dataByteLen - i);
+            i += partialBlock;
+
+            SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            #ifdef KeccakReference
+            displayBytes(1, "Squeezed block (part)", curData, partialBlock);
+            #endif
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* Remove the aliases defined at the top of this file (presumably so it can be included again with another prefix/SnP); JOIN0 and JOIN are left defined. */
+#undef Sponge
+#undef SpongeInstance
+#undef SpongeInitialize
+#undef SpongeAbsorb
+#undef SpongeAbsorbLastFewBits
+#undef SpongeSqueeze
+#undef SnP_stateSizeInBytes
+#undef SnP_stateAlignment
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_ExtractBytes
diff --git a/Modules/_sha3/kcp/PlSnP-Fallback.inc b/Modules/_sha3/kcp/PlSnP-Fallback.inc
new file mode 100644
index 00000000000..3a9119ab4b6
--- /dev/null
+++ b/Modules/_sha3/kcp/PlSnP-Fallback.inc
@@ -0,0 +1,257 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* expect PlSnP_baseParallelism, PlSnP_targetParallelism */
+
+/* expect SnP_stateSizeInBytes, SnP_stateAlignment */
+
+/* expect prefix */
+
+/* expect SnP_* */
+
+
+#define JOIN0(a, b)                     a ## b
+#define JOIN(a, b)                      JOIN0(a, b)
+/* Names of the functions defined by this file, built from the includer-supplied 'prefix'. */
+#define PlSnP_StaticInitialize          JOIN(prefix, _StaticInitialize)
+#define PlSnP_InitializeAll             JOIN(prefix, _InitializeAll)
+#define PlSnP_AddByte                   JOIN(prefix, _AddByte)
+#define PlSnP_AddBytes                  JOIN(prefix, _AddBytes)
+#define PlSnP_AddLanesAll               JOIN(prefix, _AddLanesAll)
+#define PlSnP_OverwriteBytes            JOIN(prefix, _OverwriteBytes)
+#define PlSnP_OverwriteLanesAll         JOIN(prefix, _OverwriteLanesAll)
+#define PlSnP_OverwriteWithZeroes       JOIN(prefix, _OverwriteWithZeroes)
+#define PlSnP_ExtractBytes              JOIN(prefix, _ExtractBytes)
+#define PlSnP_ExtractLanesAll           JOIN(prefix, _ExtractLanesAll)
+#define PlSnP_ExtractAndAddBytes        JOIN(prefix, _ExtractAndAddBytes)
+#define PlSnP_ExtractAndAddLanesAll     JOIN(prefix, _ExtractAndAddLanesAll)
+/* A base with parallelism 1 names its size/alignment constants _state*, a parallel base _states*; PlSnP_factor base states are laid out SnP_stateOffset bytes apart. */
+#if (PlSnP_baseParallelism == 1)
+    #define SnP_stateSizeInBytes            JOIN(SnP, _stateSizeInBytes)
+    #define SnP_stateAlignment              JOIN(SnP, _stateAlignment)
+#else
+    #define SnP_stateSizeInBytes            JOIN(SnP, _statesSizeInBytes)
+    #define SnP_stateAlignment              JOIN(SnP, _statesAlignment)
+#endif
+#define PlSnP_factor ((PlSnP_targetParallelism)/(PlSnP_baseParallelism))
+#define SnP_stateOffset (((SnP_stateSizeInBytes+(SnP_stateAlignment-1))/SnP_stateAlignment)*SnP_stateAlignment)
+#define stateWithIndex(i) ((unsigned char *)states+((i)*SnP_stateOffset))
+/* Names of the base implementation's functions that the fallback forwards to, built from the includer-supplied 'SnP'. */
+#define SnP_StaticInitialize            JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize                  JOIN(SnP, _Initialize)
+#define SnP_InitializeAll               JOIN(SnP, _InitializeAll)
+#define SnP_AddByte                     JOIN(SnP, _AddByte)
+#define SnP_AddBytes                    JOIN(SnP, _AddBytes)
+#define SnP_AddLanesAll                 JOIN(SnP, _AddLanesAll)
+#define SnP_OverwriteBytes              JOIN(SnP, _OverwriteBytes)
+#define SnP_OverwriteLanesAll           JOIN(SnP, _OverwriteLanesAll)
+#define SnP_OverwriteWithZeroes         JOIN(SnP, _OverwriteWithZeroes)
+#define SnP_ExtractBytes                JOIN(SnP, _ExtractBytes)
+#define SnP_ExtractLanesAll             JOIN(SnP, _ExtractLanesAll)
+#define SnP_ExtractAndAddBytes          JOIN(SnP, _ExtractAndAddBytes)
+#define SnP_ExtractAndAddLanesAll       JOIN(SnP, _ExtractAndAddLanesAll)
+/* Forwards to the base implementation's static initializer. */
+void PlSnP_StaticInitialize( void )
+{
+    SnP_StaticInitialize();
+}
+/* Initialize each of the PlSnP_factor base states stored in states. */
+void PlSnP_InitializeAll(void *states)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++)
+    #if (PlSnP_baseParallelism == 1)
+        SnP_Initialize(stateWithIndex(i));
+    #else
+        SnP_InitializeAll(stateWithIndex(i));
+    #endif
+}
+/* Forward SnP_AddByte to instance instanceIndex; with a parallel base the index is split into (base state, instance within that state). */
+void PlSnP_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_AddByte(stateWithIndex(instanceIndex), byte, offset);
+    #else
+        SnP_AddByte(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byte, offset);
+    #endif
+}
+/* Forward SnP_AddBytes (length bytes of data at offset) to instance instanceIndex; index split as in PlSnP_AddByte for a parallel base. */
+void PlSnP_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_AddBytes(stateWithIndex(instanceIndex), data, offset, length);
+    #else
+        SnP_AddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+    #endif
+}
+/* Add laneCount lanes into every base state; data advances by PlSnP_baseParallelism*laneOffset lanes between consecutive base states. */
+void PlSnP_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_AddBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
+        #else
+            SnP_AddLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
+        #endif
+        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
+    }
+}
+/* Forward SnP_OverwriteBytes (length bytes of data at offset) to instance instanceIndex; index split into (base state, instance) for a parallel base. */
+void PlSnP_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_OverwriteBytes(stateWithIndex(instanceIndex), data, offset, length);
+    #else
+        SnP_OverwriteBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+    #endif
+}
+/* Overwrite laneCount lanes in every base state; data advances by PlSnP_baseParallelism*laneOffset lanes between consecutive base states. */
+void PlSnP_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_OverwriteBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
+        #else
+            SnP_OverwriteLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
+        #endif
+        data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
+    }
+}
+/* Forward SnP_OverwriteWithZeroes (byteCount bytes) to instance instanceIndex; index split into (base state, instance) for a parallel base. */
+void PlSnP_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex), byteCount);
+    #else
+        SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byteCount);
+    #endif
+}
+/* Apply the base permutation to each of the PlSnP_factor base states. PlSnP_PermuteAll is not given an alias in this file; the name must be defined by the includer. */
+void PlSnP_PermuteAll(void *states)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_Permute(stateWithIndex(i));
+        #else
+            SnP_PermuteAll(stateWithIndex(i));
+        #endif
+    }
+}
+/* 12-round variant of PlSnP_PermuteAll, compiled only when the base provides a 12-round permutation macro. */
+#if (defined(SnP_Permute_12rounds) || defined(SnP_PermuteAll_12rounds))
+void PlSnP_PermuteAll_12rounds(void *states)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_Permute_12rounds(stateWithIndex(i));
+        #else
+            SnP_PermuteAll_12rounds(stateWithIndex(i));
+        #endif
+    }
+}
+#endif
+/* Forward SnP_ExtractBytes (length bytes from offset into data) to instance instanceIndex; index split into (base state, instance) for a parallel base. */
+void PlSnP_ExtractBytes(void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_ExtractBytes(stateWithIndex(instanceIndex), data, offset, length);
+    #else
+        SnP_ExtractBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
+    #endif
+}
+/* Extract laneCount lanes from every base state into data, advancing data by PlSnP_baseParallelism*laneOffset lanes per base state. Note: stateWithIndex casts away the const of states. */
+void PlSnP_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_ExtractBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
+        #else
+            SnP_ExtractLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
+        #endif
+        data += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+    }
+}
+/* Forward SnP_ExtractAndAddBytes (input, output, offset, length) to instance instanceIndex; index split into (base state, instance) for a parallel base. */
+void PlSnP_ExtractAndAddBytes(void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    #if (PlSnP_baseParallelism == 1)
+        SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex), input, output, offset, length);
+    #else
+        SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, input, output, offset, length);
+    #endif
+}
+/* Run extract-and-add over laneCount lanes of every base state; input and output both advance by PlSnP_baseParallelism*laneOffset lanes per base state. Note: stateWithIndex casts away the const of states. */
+void PlSnP_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
+{
+    unsigned int i;
+
+    for(i=0; i<PlSnP_factor; i++) {
+        #if (PlSnP_baseParallelism == 1)
+            SnP_ExtractAndAddBytes(stateWithIndex(i), input, output, 0, laneCount*SnP_laneLengthInBytes);
+        #else
+            SnP_ExtractAndAddLanesAll(stateWithIndex(i), input, output, laneCount, laneOffset);
+        #endif
+        input += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+        output += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
+    }
+}
+/* Remove every alias used by this file. PlSnP_factor, SnP_stateOffset and stateWithIndex appear twice (harmless); PlSnP_PermuteAll is undefined here although this file never defines it. */
+#undef PlSnP_factor
+#undef SnP_stateOffset
+#undef stateWithIndex
+#undef JOIN0
+#undef JOIN
+#undef PlSnP_StaticInitialize
+#undef PlSnP_InitializeAll
+#undef PlSnP_AddByte
+#undef PlSnP_AddBytes
+#undef PlSnP_AddLanesAll
+#undef PlSnP_OverwriteBytes
+#undef PlSnP_OverwriteLanesAll
+#undef PlSnP_OverwriteWithZeroes
+#undef PlSnP_PermuteAll
+#undef PlSnP_ExtractBytes
+#undef PlSnP_ExtractLanesAll
+#undef PlSnP_ExtractAndAddBytes
+#undef PlSnP_ExtractAndAddLanesAll
+#undef SnP_stateAlignment
+#undef SnP_stateSizeInBytes
+#undef PlSnP_factor
+#undef SnP_stateOffset
+#undef stateWithIndex
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_InitializeAll
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_AddLanesAll
+#undef SnP_OverwriteBytes
+#undef SnP_OverwriteWithZeroes
+#undef SnP_OverwriteLanesAll
+#undef SnP_ExtractBytes
+#undef SnP_ExtractLanesAll
+#undef SnP_ExtractAndAddBytes
+#undef SnP_ExtractAndAddLanesAll
diff --git a/Modules/_sha3/kcp/SnP-Relaned.h b/Modules/_sha3/kcp/SnP-Relaned.h
new file mode 100644
index 00000000000..086e635ff8f
--- /dev/null
+++ b/Modules/_sha3/kcp/SnP-Relaned.h
@@ -0,0 +1,134 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _SnP_Relaned_h_
+#define _SnP_Relaned_h_
+/* Byte-granular add built from lane primitives: whole lanes plus a trailing partial lane when offset is 0, else lane by lane. Expands to a bare { } block and evaluates its arguments more than once. */
+#define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_AddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+/* Byte-granular overwrite built from lane primitives: whole lanes plus a trailing partial lane when offset is 0, else lane by lane. Expands to a bare { } block and evaluates its arguments more than once. */
+#define SnP_OverwriteBytes(state, data, offset, length, SnP_OverwriteLanes, SnP_OverwriteBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_OverwriteLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_OverwriteBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_OverwriteBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+/* Byte-granular extract built from lane primitives: whole lanes plus a trailing partial lane when offset is 0, else lane by lane. Expands to a bare { } block and evaluates its arguments more than once. */
+#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+/* Byte-granular extract-and-add built from lane primitives, with input and output advanced in step: whole lanes plus a partial lane when offset is 0, else lane by lane. Bare { } block; arguments evaluated more than once. */
+#define SnP_ExtractAndAddBytes(state, input, output, offset, length, SnP_ExtractAndAddLanes, SnP_ExtractAndAddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractAndAddLanes(state, input, output, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractAndAddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (input)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                (output)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curInput = (input); \
+            unsigned char *_curOutput = (output); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractAndAddBytesInLane(state, _lanePosition, _curInput, _curOutput, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curInput += _bytesInLane; \
+                _curOutput += _bytesInLane; \
+            } \
+        } \
+    }
+
+#endif
diff --git a/Modules/_sha3/kcp/align.h b/Modules/_sha3/kcp/align.h
new file mode 100644
index 00000000000..6650fe8c3cf
--- /dev/null
+++ b/Modules/_sha3/kcp/align.h
@@ -0,0 +1,35 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _align_h_
+#define _align_h_
+
+/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
+
+#ifdef ALIGN
+#undef ALIGN
+#endif
+/* Map ALIGN(x) to the compiler-specific alignment syntax; on any other compiler it expands to nothing, i.e. no alignment is requested. */
+#if defined(__GNUC__)
+#define ALIGN(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#elif defined(__ARMCC_VERSION)
+#define ALIGN(x) __align(x)
+#else
+#define ALIGN(x)
+#endif
+
+#endif
diff --git a/Modules/_sha3/sha3module.c b/Modules/_sha3/sha3module.c
new file mode 100644
index 00000000000..67c69f2c85a
--- /dev/null
+++ b/Modules/_sha3/sha3module.c
@@ -0,0 +1,749 @@
+/* SHA3 module
+ *
+ * This module provides an interface to the SHA3 algorithm
+ *
+ * See below for information about the original code this module was
+ * based upon. Additional work performed by:
+ *
+ *  Andrew Kuchling (amk@amk.ca)
+ *  Greg Stein (gstein@lyra.org)
+ *  Trevor Perrin (trevp@trevp.net)
+ *  Gregory P. Smith (greg@krypto.org)
+ *
+ * Copyright (C) 2012-2016  Christian Heimes (christian@python.org)
+ * Licensed to PSF under a Contributor Agreement.
+ *
+ */
+
+#include "Python.h"
+#include "pystrhex.h"
+#include "../hashlib.h"
+
+/* **************************************************************************
+ *                          SHA-3 (Keccak) and SHAKE
+ *
+ * The code is based on KeccakCodePackage from 2016-04-23
+ * commit 647f93079afc4ada3d23737477a6e52511ca41fd
+ *
+ * The reference implementation is altered in these points:
+ *  - C++ comments are converted to ANSI C comments.
+ *  - all function names are mangled
+ *  - typedef for UINT64 is commented out.
+ *  - brg_endian.h is removed
+ *
+ * *************************************************************************/
+
+#ifdef __sparc
+  /* opt64 uses un-aligned memory access that causes a BUS error with msg
+   * 'invalid address alignment' on SPARC. */
+  #define KeccakOpt 32
+#elif SIZEOF_VOID_P == 8 && defined(PY_UINT64_T)
+  /* opt64 works only for 64bit platforms with unsigned int64 */
+  #define KeccakOpt 64
+#else
+  /* opt32 is used for the remaining 32 and 64bit platforms */
+  #define KeccakOpt 32
+#endif
+
+#if KeccakOpt == 64 && defined(PY_UINT64_T)
+  /* 64bit platforms with unsigned int64 */
+  typedef PY_UINT64_T UINT64;
+  typedef unsigned char UINT8;
+#endif
+
+/* replacement for brg_endian.h */
+#define IS_LITTLE_ENDIAN 1234
+#define IS_BIG_ENDIAN 4321
+#if PY_LITTLE_ENDIAN
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+#if PY_BIG_ENDIAN
+#define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#endif
+
+/* mangle names */
+#define KeccakF1600_FastLoop_Absorb _PySHA3_KeccakF1600_FastLoop_Absorb
+#define Keccak_HashFinal _PySHA3_Keccak_HashFinal
+#define Keccak_HashInitialize _PySHA3_Keccak_HashInitialize
+#define Keccak_HashSqueeze _PySHA3_Keccak_HashSqueeze
+#define Keccak_HashUpdate _PySHA3_Keccak_HashUpdate
+#define KeccakP1600_AddBytes _PySHA3_KeccakP1600_AddBytes
+#define KeccakP1600_AddBytesInLane _PySHA3_KeccakP1600_AddBytesInLane
+#define KeccakP1600_AddLanes _PySHA3_KeccakP1600_AddLanes
+#define KeccakP1600_ExtractAndAddBytes _PySHA3_KeccakP1600_ExtractAndAddBytes
+#define KeccakP1600_ExtractAndAddBytesInLane _PySHA3_KeccakP1600_ExtractAndAddBytesInLane
+#define KeccakP1600_ExtractAndAddLanes _PySHA3_KeccakP1600_ExtractAndAddLanes
+#define KeccakP1600_ExtractBytes _PySHA3_KeccakP1600_ExtractBytes
+#define KeccakP1600_ExtractBytesInLane _PySHA3_KeccakP1600_ExtractBytesInLane
+#define KeccakP1600_ExtractLanes _PySHA3_KeccakP1600_ExtractLanes
+#define KeccakP1600_Initialize _PySHA3_KeccakP1600_Initialize
+#define KeccakP1600_OverwriteBytes _PySHA3_KeccakP1600_OverwriteBytes
+#define KeccakP1600_OverwriteBytesInLane _PySHA3_KeccakP1600_OverwriteBytesInLane
+#define KeccakP1600_OverwriteLanes _PySHA3_KeccakP1600_OverwriteLanes
+#define KeccakP1600_OverwriteWithZeroes _PySHA3_KeccakP1600_OverwriteWithZeroes
+#define KeccakP1600_Permute_12rounds _PySHA3_KeccakP1600_Permute_12rounds
+#define KeccakP1600_Permute_24rounds _PySHA3_KeccakP1600_Permute_24rounds
+#define KeccakWidth1600_Sponge _PySHA3_KeccakWidth1600_Sponge
+#define KeccakWidth1600_SpongeAbsorb _PySHA3_KeccakWidth1600_SpongeAbsorb
+#define KeccakWidth1600_SpongeAbsorbLastFewBits _PySHA3_KeccakWidth1600_SpongeAbsorbLastFewBits
+#define KeccakWidth1600_SpongeInitialize _PySHA3_KeccakWidth1600_SpongeInitialize
+#define KeccakWidth1600_SpongeSqueeze _PySHA3_KeccakWidth1600_SpongeSqueeze
+#if KeccakOpt == 32
+#define KeccakP1600_AddByte _PySHA3_KeccakP1600_AddByte
+#define KeccakP1600_Permute_Nrounds _PySHA3_KeccakP1600_Permute_Nrounds
+#define KeccakP1600_SetBytesInLaneToZero _PySHA3_KeccakP1600_SetBytesInLaneToZero
+#endif
+
+/* we are only interested in KeccakP1600 */
+#define KeccakP200_excluded 1
+#define KeccakP400_excluded 1
+#define KeccakP800_excluded 1
+
+/* inline all Keccak dependencies */
+#include "kcp/KeccakHash.h"
+#include "kcp/KeccakSponge.h"
+#include "kcp/KeccakHash.c"
+#include "kcp/KeccakSponge.c"
+#if KeccakOpt == 64
+  #include "kcp/KeccakP-1600-opt64.c"
+#elif KeccakOpt == 32
+  #include "kcp/KeccakP-1600-inplace32BI.c"
+#endif
+
+#define SHA3_MAX_DIGESTSIZE 64 /* 64 Bytes (512 Bits) for 224 to 512 */
+#define SHA3_state Keccak_HashInstance
+#define SHA3_init Keccak_HashInitialize
+#define SHA3_process Keccak_HashUpdate
+#define SHA3_done Keccak_HashFinal
+#define SHA3_squeeze Keccak_HashSqueeze
+#define SHA3_copystate(dest, src) memcpy(&(dest), &(src), sizeof(SHA3_state))
+
+
+/*[clinic input]
+module _sha3
+class _sha3.sha3_224 "SHA3object *" "&SHA3_224typ"
+class _sha3.sha3_256 "SHA3object *" "&SHA3_256typ"
+class _sha3.sha3_384 "SHA3object *" "&SHA3_384typ"
+class _sha3.sha3_512 "SHA3object *" "&SHA3_512typ"
+class _sha3.shake_128 "SHA3object *" "&SHAKE128type"
+class _sha3.shake_256 "SHA3object *" "&SHAKE256type"
+[clinic start generated code]*/
+/*[clinic end generated code: output=da39a3ee5e6b4b0d input=b8a53680f370285a]*/
+
+/* The structure for storing SHA3 info */
+
+/* NOTE(review): every test of PY_WITH_KECCAK in this file is an #ifdef, which
+ * checks whether the macro is defined and not what its value is.  Defining it
+ * as 0 therefore still compiles the Keccak_* types in -- confirm that this is
+ * intended. */
+#define PY_WITH_KECCAK 0
+
+/* Instance layout shared by all SHA3, Keccak and SHAKE hash types. */
+typedef struct {
+    PyObject_HEAD
+    SHA3_state hash_state;      /* Keccak_HashInstance, see SHA3_state */
+#ifdef WITH_THREAD
+    /* Set to NULL on creation; update() allocates it once it sees a buffer
+     * of at least HASHLIB_GIL_MINSIZE bytes. */
+    PyThread_type_lock lock;
+#endif
+} SHA3object;
+
+static PyTypeObject SHA3_224type;
+static PyTypeObject SHA3_256type;
+static PyTypeObject SHA3_384type;
+static PyTypeObject SHA3_512type;
+#ifdef PY_WITH_KECCAK
+static PyTypeObject Keccak_224type;
+static PyTypeObject Keccak_256type;
+static PyTypeObject Keccak_384type;
+static PyTypeObject Keccak_512type;
+#endif
+static PyTypeObject SHAKE128type;
+static PyTypeObject SHAKE256type;
+
+#include "clinic/sha3module.c.h"
+
+static SHA3object *
+newSHA3object(PyTypeObject *type)
+{
+    /* Allocate a hash object of the given type.  hash_state is left
+     * untouched; the caller has to initialise or copy it.  Returns NULL
+     * when the allocation fails. */
+    SHA3object *obj = (SHA3object *)PyObject_New(SHA3object, type);
+
+    if (!obj) {
+        return NULL;
+    }
+#ifdef WITH_THREAD
+    /* The lock is only created on demand by update(). */
+    obj->lock = NULL;
+#endif
+    return obj;
+}
+
+
+/*[clinic input]
+@classmethod
+_sha3.sha3_224.__new__ as py_sha3_new
+    string as data: object = NULL
+
+Return a new SHA3 hash object with a hashbit length of 28 bytes.
+[clinic start generated code]*/
+
+static PyObject *
+py_sha3_new_impl(PyTypeObject *type, PyObject *data)
+/*[clinic end generated code: output=8d5c34279e69bf09 input=d7c582b950a858b6]*/
+{
+    /* Create a hash object of the given type and optionally feed it the
+     * initial data.
+     *
+     * type: one of the static SHA3/Keccak/SHAKE type objects of this module;
+     *       any other type ends in PyErr_BadInternalCall().
+     * data: optional object to hash right away, or NULL.  It is read through
+     *       GET_BUFFER_VIEW_OR_ERROR().
+     *
+     * Returns a new reference, or NULL with an exception set.
+     */
+    SHA3object *self = NULL;
+    Py_buffer buf = {NULL, NULL};
+    HashReturn res;
+
+    self = newSHA3object(type);
+    if (self == NULL) {
+        goto error;
+    }
+
+    /* Select the sponge parameters that belong to the requested type. */
+    if (type == &SHA3_224type) {
+        res = Keccak_HashInitialize_SHA3_224(&self->hash_state);
+    } else if (type == &SHA3_256type) {
+        res = Keccak_HashInitialize_SHA3_256(&self->hash_state);
+    } else if (type == &SHA3_384type) {
+        res = Keccak_HashInitialize_SHA3_384(&self->hash_state);
+    } else if (type == &SHA3_512type) {
+        res = Keccak_HashInitialize_SHA3_512(&self->hash_state);
+#ifdef PY_WITH_KECCAK
+    } else if (type == &Keccak_224type) {
+        res = Keccak_HashInitialize(&self->hash_state, 1152, 448, 224, 0x01);
+    } else if (type == &Keccak_256type) {
+        res = Keccak_HashInitialize(&self->hash_state, 1088, 512, 256, 0x01);
+    } else if (type == &Keccak_384type) {
+        res = Keccak_HashInitialize(&self->hash_state, 832, 768, 384, 0x01);
+    } else if (type == &Keccak_512type) {
+        res = Keccak_HashInitialize(&self->hash_state, 576, 1024, 512, 0x01);
+#endif
+    } else if (type == &SHAKE128type) {
+        res = Keccak_HashInitialize_SHAKE128(&self->hash_state);
+    } else if (type == &SHAKE256type) {
+        res = Keccak_HashInitialize_SHAKE256(&self->hash_state);
+    } else {
+        PyErr_BadInternalCall();
+        goto error;
+    }
+
+    /* The initialisation result used to be dropped: it was overwritten by
+     * SHA3_process() below or never looked at when no data was passed. */
+    if (res != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "internal error in SHA3 initialize()");
+        goto error;
+    }
+
+    if (data) {
+        GET_BUFFER_VIEW_OR_ERROR(data, &buf, goto error);
+        /* SHA3_process() takes the input length in bits, not bytes. */
+#ifdef WITH_THREAD
+        if (buf.len >= HASHLIB_GIL_MINSIZE) {
+            /* invariant: New objects can't be accessed by other code yet,
+             * thus it's safe to release the GIL without locking the object.
+             */
+            Py_BEGIN_ALLOW_THREADS
+            res = SHA3_process(&self->hash_state, buf.buf, buf.len * 8);
+            Py_END_ALLOW_THREADS
+        }
+        else {
+            res = SHA3_process(&self->hash_state, buf.buf, buf.len * 8);
+        }
+#else
+        res = SHA3_process(&self->hash_state, buf.buf, buf.len * 8);
+#endif
+        if (res != SUCCESS) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "internal error in SHA3 Update()");
+            goto error;
+        }
+        PyBuffer_Release(&buf);
+    }
+
+    return (PyObject *)self;
+
+  error:
+    if (self) {
+        Py_DECREF(self);
+    }
+    /* buf.obj is only non-NULL once a buffer view has been acquired. */
+    if (data && buf.obj) {
+        PyBuffer_Release(&buf);
+    }
+    return NULL;
+}
+
+
+/* Internal methods for a hash object */
+
+static void
+SHA3_dealloc(SHA3object *self)
+{
+    /* Release the lock if one was ever allocated, then the object itself. */
+#ifdef WITH_THREAD
+    if (self->lock != NULL) {
+        PyThread_free_lock(self->lock);
+    }
+#endif
+    PyObject_Del(self);
+}
+
+
+/* External methods for a hash object */
+
+
+/*[clinic input]
+_sha3.sha3_224.copy
+
+Return a copy of the hash object.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_sha3_224_copy_impl(SHA3object *self)
+/*[clinic end generated code: output=6c537411ecdcda4c input=93a44aaebea51ba8]*/
+{
+    /* Build a second object of the same type and give it a byte-wise copy
+     * of the current hash state.  Returns NULL if the allocation fails. */
+    SHA3object *clone = newSHA3object(Py_TYPE(self));
+
+    if (clone == NULL) {
+        return NULL;
+    }
+
+    /* The state is read between ENTER_HASHLIB and LEAVE_HASHLIB. */
+    ENTER_HASHLIB(self);
+    SHA3_copystate(clone->hash_state, self->hash_state);
+    LEAVE_HASHLIB(self);
+
+    return (PyObject *)clone;
+}
+
+
+/*[clinic input]
+_sha3.sha3_224.digest
+
+Return the digest value as a string of binary data.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_sha3_224_digest_impl(SHA3object *self)
+/*[clinic end generated code: output=fd531842e20b2d5b input=a5807917d219b30e]*/
+{
+    /* Finalise a private copy of the state, so the object itself is not
+     * modified, and return the result as a bytes object of
+     * fixedOutputLength / 8 bytes. */
+    SHA3_state snapshot;
+    unsigned char out[SHA3_MAX_DIGESTSIZE];
+
+    ENTER_HASHLIB(self);
+    SHA3_copystate(snapshot, self->hash_state);
+    LEAVE_HASHLIB(self);
+
+    if (SHA3_done(&snapshot, out) != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
+        return NULL;
+    }
+    return PyBytes_FromStringAndSize(
+        (const char *)out, self->hash_state.fixedOutputLength / 8);
+}
+
+
+/*[clinic input]
+_sha3.sha3_224.hexdigest
+
+Return the digest value as a string of hexadecimal digits.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_sha3_224_hexdigest_impl(SHA3object *self)
+/*[clinic end generated code: output=75ad03257906918d input=2d91bb6e0d114ee3]*/
+{
+    /* Same as digest(), except that the fixedOutputLength / 8 digest bytes
+     * are handed to _Py_strhex() instead of being returned raw. */
+    SHA3_state snapshot;
+    unsigned char out[SHA3_MAX_DIGESTSIZE];
+
+    /* Work on a copy so that the object itself is not finalised. */
+    ENTER_HASHLIB(self);
+    SHA3_copystate(snapshot, self->hash_state);
+    LEAVE_HASHLIB(self);
+
+    if (SHA3_done(&snapshot, out) != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Final()");
+        return NULL;
+    }
+    return _Py_strhex(
+        (const char *)out, self->hash_state.fixedOutputLength / 8);
+}
+
+
+/*[clinic input]
+_sha3.sha3_224.update
+
+    obj: object
+    /
+
+Update this hash object's state with the provided string.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_sha3_224_update(SHA3object *self, PyObject *obj)
+/*[clinic end generated code: output=06721d55b483e0af input=be44bf0d1c279791]*/
+{
+    /* Absorb the contents of obj into the hash state.  Returns None, or
+     * NULL with RuntimeError set when SHA3_process() does not succeed. */
+    Py_buffer view;
+    HashReturn status;
+
+    GET_BUFFER_VIEW_OR_ERROUT(obj, &view);
+
+    /* SHA3_process() expects the amount of data in bits, hence the * 8. */
+#ifdef WITH_THREAD
+    /* The first sufficiently large buffer triggers creation of the lock. */
+    if (self->lock == NULL && view.len >= HASHLIB_GIL_MINSIZE) {
+        self->lock = PyThread_allocate_lock();
+    }
+    if (self->lock != NULL) {
+        /* From the moment a lock exists every update has to go through it.
+         * The GIL is dropped even for short inputs because another thread
+         * may hold the lock for a long time while it hashes a large
+         * buffer. */
+        Py_BEGIN_ALLOW_THREADS
+        PyThread_acquire_lock(self->lock, 1);
+        status = SHA3_process(&self->hash_state, view.buf, view.len * 8);
+        PyThread_release_lock(self->lock);
+        Py_END_ALLOW_THREADS
+    }
+    else {
+        status = SHA3_process(&self->hash_state, view.buf, view.len * 8);
+    }
+#else
+    status = SHA3_process(&self->hash_state, view.buf, view.len * 8);
+#endif
+
+    /* The view is released on the success and on the failure path alike. */
+    PyBuffer_Release(&view);
+    if (status != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "internal error in SHA3 Update()");
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/* Method table used by the sha3_* types and, when compiled in, the keccak_*
+ * types (see the SHA3_TYPE() instantiations).  The *_METHODDEF entries are
+ * macros, presumably provided by the included clinic/sha3module.c.h. */
+static PyMethodDef SHA3_methods[] = {
+    _SHA3_SHA3_224_COPY_METHODDEF
+    _SHA3_SHA3_224_DIGEST_METHODDEF
+    _SHA3_SHA3_224_HEXDIGEST_METHODDEF
+    _SHA3_SHA3_224_UPDATE_METHODDEF
+    {NULL,        NULL}         /* sentinel */
+};
+
+
+static PyObject *
+SHA3_get_block_size(SHA3object *self, void *closure)
+{
+    /* Getter for "block_size": the sponge rate divided by 8. */
+    const int rate_bits = self->hash_state.sponge.rate;
+    const int block_bytes = rate_bits / 8;
+
+    return PyLong_FromLong(block_bytes);
+}
+
+
+static PyObject *
+SHA3_get_name(SHA3object *self, void *closure)
+{
+    /* Getter for "name": map the concrete type of self to its algorithm
+     * name.  An unknown type ends in PyErr_BadInternalCall(). */
+    const char *name = NULL;
+    PyTypeObject *tp = Py_TYPE(self);
+
+    if (tp == &SHA3_224type) {
+        name = "sha3_224";
+    }
+    else if (tp == &SHA3_256type) {
+        name = "sha3_256";
+    }
+    else if (tp == &SHA3_384type) {
+        name = "sha3_384";
+    }
+    else if (tp == &SHA3_512type) {
+        name = "sha3_512";
+    }
+#ifdef PY_WITH_KECCAK
+    else if (tp == &Keccak_224type) {
+        name = "keccak_224";
+    }
+    else if (tp == &Keccak_256type) {
+        name = "keccak_256";
+    }
+    else if (tp == &Keccak_384type) {
+        name = "keccak_384";
+    }
+    else if (tp == &Keccak_512type) {
+        name = "keccak_512";
+    }
+#endif
+    else if (tp == &SHAKE128type) {
+        name = "shake_128";
+    }
+    else if (tp == &SHAKE256type) {
+        name = "shake_256";
+    }
+    else {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    return PyUnicode_FromString(name);
+}
+
+
+static PyObject *
+SHA3_get_digest_size(SHA3object *self, void *closure)
+{
+    /* Getter for "digest_size": fixedOutputLength divided by 8. */
+    const long nbytes = (long)(self->hash_state.fixedOutputLength / 8);
+
+    return PyLong_FromLong(nbytes);
+}
+
+
+static PyObject *
+SHA3_get_capacity_bits(SHA3object *self, void *closure)
+{
+    /* Getter for "_capacity_bits": 1600 minus the sponge rate. */
+    return PyLong_FromLong((int)(1600 - self->hash_state.sponge.rate));
+}
+
+
+static PyObject *
+SHA3_get_rate_bits(SHA3object *self, void *closure)
+{
+    /* Getter for "_rate_bits": the sponge rate exactly as stored. */
+    return PyLong_FromLong((unsigned int)self->hash_state.sponge.rate);
+}
+
+static PyObject *
+SHA3_get_suffix(SHA3object *self, void *closure)
+{
+    /* Getter for "_suffix": a bytes object of length one that holds the
+     * delimitedSuffix value of the hash state. */
+    unsigned char suffix = self->hash_state.delimitedSuffix;
+
+    return PyBytes_FromStringAndSize((const char *)&suffix, 1);
+}
+
+
+/* Attribute table installed on every type created with SHA3_TYPE().  All
+ * entries have a getter only (setter, doc and closure are NULL). */
+static PyGetSetDef SHA3_getseters[] = {
+    {"block_size", (getter)SHA3_get_block_size, NULL, NULL, NULL},
+    {"name", (getter)SHA3_get_name, NULL, NULL, NULL},
+    {"digest_size", (getter)SHA3_get_digest_size, NULL, NULL, NULL},
+    {"_capacity_bits", (getter)SHA3_get_capacity_bits, NULL, NULL, NULL},
+    {"_rate_bits", (getter)SHA3_get_rate_bits, NULL, NULL, NULL},
+    {"_suffix", (getter)SHA3_get_suffix, NULL, NULL, NULL},
+    {NULL}  /* Sentinel */
+};
+
+
+/* Define the static PyTypeObject `type_obj`.
+ *
+ *   type_obj:     identifier of the type object to define
+ *   type_name:    string used for tp_name
+ *   type_doc:     string used for tp_doc
+ *   type_methods: PyMethodDef table used for tp_methods
+ *
+ * All types created this way share SHA3object as instance layout,
+ * SHA3_dealloc, SHA3_getseters and py_sha3_new.
+ */
+#define SHA3_TYPE(type_obj, type_name, type_doc, type_methods) \
+    static PyTypeObject type_obj = { \
+        PyVarObject_HEAD_INIT(NULL, 0) \
+        type_name,          /* tp_name */ \
+        sizeof(SHA3object), /* tp_basicsize */ \
+        0,                  /* tp_itemsize */ \
+        /*  methods  */ \
+        (destructor)SHA3_dealloc, /* tp_dealloc */ \
+        0,                  /* tp_print */ \
+        0,                  /* tp_getattr */ \
+        0,                  /* tp_setattr */ \
+        0,                  /* tp_reserved */ \
+        0,                  /* tp_repr */ \
+        0,                  /* tp_as_number */ \
+        0,                  /* tp_as_sequence */ \
+        0,                  /* tp_as_mapping */ \
+        0,                  /* tp_hash */ \
+        0,                  /* tp_call */ \
+        0,                  /* tp_str */ \
+        0,                  /* tp_getattro */ \
+        0,                  /* tp_setattro */ \
+        0,                  /* tp_as_buffer */ \
+        Py_TPFLAGS_DEFAULT, /* tp_flags */ \
+        type_doc,           /* tp_doc */ \
+        0,                  /* tp_traverse */ \
+        0,                  /* tp_clear */ \
+        0,                  /* tp_richcompare */ \
+        0,                  /* tp_weaklistoffset */ \
+        0,                  /* tp_iter */ \
+        0,                  /* tp_iternext */ \
+        type_methods,       /* tp_methods */ \
+        NULL,               /* tp_members */ \
+        SHA3_getseters,     /* tp_getset */ \
+        0,                  /* tp_base */ \
+        0,                  /* tp_dict */ \
+        0,                  /* tp_descr_get */ \
+        0,                  /* tp_descr_set */ \
+        0,                  /* tp_dictoffset */ \
+        0,                  /* tp_init */ \
+        0,                  /* tp_alloc */ \
+        py_sha3_new,        /* tp_new */ \
+    }
+
+PyDoc_STRVAR(sha3_256__doc__,
+"sha3_256([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 32 bytes.");
+
+PyDoc_STRVAR(sha3_384__doc__,
+"sha3_384([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 48 bytes.");
+
+PyDoc_STRVAR(sha3_512__doc__,
+"sha3_512([string]) -> SHA3 object\n\
+\n\
+Return a new SHA3 hash object with a hashbit length of 64 bytes.");
+
+SHA3_TYPE(SHA3_224type, "_sha3.sha3_224", py_sha3_new__doc__, SHA3_methods);
+SHA3_TYPE(SHA3_256type, "_sha3.sha3_256", sha3_256__doc__, SHA3_methods);
+SHA3_TYPE(SHA3_384type, "_sha3.sha3_384", sha3_384__doc__, SHA3_methods);
+SHA3_TYPE(SHA3_512type, "_sha3.sha3_512", sha3_512__doc__, SHA3_methods);
+
+#ifdef PY_WITH_KECCAK
+PyDoc_STRVAR(keccak_224__doc__,
+"keccak_224([string]) -> Keccak object\n\
+\n\
+Return a new Keccak hash object with a hashbit length of 28 bytes.");
+
+PyDoc_STRVAR(keccak_256__doc__,
+"keccak_256([string]) -> Keccak object\n\
+\n\
+Return a new Keccak hash object with a hashbit length of 32 bytes.");
+
+PyDoc_STRVAR(keccak_384__doc__,
+"keccak_384([string]) -> Keccak object\n\
+\n\
+Return a new Keccak hash object with a hashbit length of 48 bytes.");
+
+PyDoc_STRVAR(keccak_512__doc__,
+"keccak_512([string]) -> Keccak object\n\
+\n\
+Return a new Keccak hash object with a hashbit length of 64 bytes.");
+
+SHA3_TYPE(Keccak_224type, "_sha3.keccak_224", keccak_224__doc__, SHA3_methods);
+SHA3_TYPE(Keccak_256type, "_sha3.keccak_256", keccak_256__doc__, SHA3_methods);
+SHA3_TYPE(Keccak_384type, "_sha3.keccak_384", keccak_384__doc__, SHA3_methods);
+SHA3_TYPE(Keccak_512type, "_sha3.keccak_512", keccak_512__doc__, SHA3_methods);
+#endif
+
+
+/* Shared implementation of the SHAKE digest() and hexdigest() methods.
+ *
+ *   self:      the SHAKE hash object; its state is copied, not modified
+ *   digestlen: number of output bytes to squeeze
+ *   hex:       non-zero to convert the bytes with _Py_strhex(), zero to
+ *              return them as a bytes object
+ *
+ * Returns a new reference, or NULL with an exception set: ValueError when
+ * digestlen is too large to be expressed in bits, MemoryError when the
+ * output buffer cannot be allocated, RuntimeError when the Keccak code
+ * reports a failure.
+ */
+static PyObject *
+_SHAKE_digest(SHA3object *self, unsigned long digestlen, int hex)
+{
+    unsigned char *digest = NULL;
+    SHA3_state temp;
+    int res;
+    PyObject *result = NULL;
+
+    /* SHA3_squeeze() is called with digestlen * 8 bits.  That product is
+     * computed as unsigned long and would silently wrap for huge lengths,
+     * so fewer bytes than allocated would be written and uninitialised
+     * memory would be handed back to the caller. */
+    if (digestlen > ULONG_MAX / 8) {
+        PyErr_SetString(PyExc_ValueError, "length is too large");
+        return NULL;
+    }
+
+    digest = (unsigned char *)PyMem_Malloc(digestlen);
+    if (digest == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    /* Get the raw (binary) digest value */
+    ENTER_HASHLIB(self);
+    SHA3_copystate(temp, self->hash_state);
+    LEAVE_HASHLIB(self);
+    /* Finalise the copy without output; the bytes are squeezed below. */
+    res = SHA3_done(&temp, NULL);
+    if (res != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 done()");
+        goto done;
+    }
+    res = SHA3_squeeze(&temp, digest, digestlen * 8);
+    if (res != SUCCESS) {
+        PyErr_SetString(PyExc_RuntimeError, "internal error in SHA3 Squeeze()");
+        /* Must not return directly: the digest buffer has to be freed. */
+        goto done;
+    }
+    if (hex) {
+        result = _Py_strhex((const char *)digest, digestlen);
+    } else {
+        result = PyBytes_FromStringAndSize((const char *)digest,
+                                           digestlen);
+    }
+  done:
+    /* Reached on success and on failure; result is NULL on failure. */
+    PyMem_Free(digest);
+    return result;
+}
+
+
+/*[clinic input]
+_sha3.shake_128.digest
+
+    length: unsigned_long(bitwise=True)
+    \
+
+Return the digest value as a string of binary data.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_shake_128_digest_impl(SHA3object *self, unsigned long length)
+/*[clinic end generated code: output=2313605e2f87bb8f input=608c8ca80ae9d115]*/
+{
+    /* Binary variant: delegate to _SHAKE_digest() with hex disabled. */
+    const int as_hex = 0;
+
+    return _SHAKE_digest(self, length, as_hex);
+}
+
+
+/*[clinic input]
+_sha3.shake_128.hexdigest
+
+    length: unsigned_long(bitwise=True)
+    \
+
+Return the digest value as a string of hexadecimal digits.
+[clinic start generated code]*/
+
+static PyObject *
+_sha3_shake_128_hexdigest_impl(SHA3object *self, unsigned long length)
+/*[clinic end generated code: output=bf8e2f1e490944a8 input=64e56b4760db4573]*/
+{
+    /* Hexadecimal variant: delegate to _SHAKE_digest() with hex enabled. */
+    const int as_hex = 1;
+
+    return _SHAKE_digest(self, length, as_hex);
+}
+
+
+/* Method table of the shake_128 and shake_256 types.  copy() and update()
+ * reuse the sha3_224 entries; digest() and hexdigest() are the SHAKE
+ * variants that take an explicit output length. */
+static PyMethodDef SHAKE_methods[] = {
+    _SHA3_SHA3_224_COPY_METHODDEF
+    _SHA3_SHAKE_128_DIGEST_METHODDEF
+    _SHA3_SHAKE_128_HEXDIGEST_METHODDEF
+    _SHA3_SHA3_224_UPDATE_METHODDEF
+    {NULL,        NULL}         /* sentinel */
+};
+
+PyDoc_STRVAR(shake_128__doc__,
+"shake_128([string]) -> SHAKE object\n\
+\n\
+Return a new SHAKE hash object.");
+
+PyDoc_STRVAR(shake_256__doc__,
+"shake_256([string]) -> SHAKE object\n\
+\n\
+Return a new SHAKE hash object.");
+
+SHA3_TYPE(SHAKE128type, "_sha3.shake_128", shake_128__doc__, SHAKE_methods);
+SHA3_TYPE(SHAKE256type, "_sha3.shake_256", shake_256__doc__, SHAKE_methods);
+
+
+/* Initialize this module. */
+/* Module definition: no docstring and no module level functions; the module
+ * is populated with its types and constants in PyInit__sha3(). */
+static struct PyModuleDef _SHA3module = {
+        PyModuleDef_HEAD_INIT,
+        "_sha3",    /* m_name */
+        NULL,       /* m_doc */
+        -1,         /* m_size */
+        NULL,       /* m_methods */
+        NULL,       /* remaining members are unused */
+        NULL,
+        NULL,
+        NULL
+};
+
+
+/* Module initialisation function.
+ *
+ * Creates the _sha3 module, readies every hash type and adds it to the
+ * module under its algorithm name, then adds the "keccakopt" and
+ * "implementation" constants.
+ *
+ * Returns the new module, or NULL with an exception set on failure.
+ */
+PyMODINIT_FUNC
+PyInit__sha3(void)
+{
+    PyObject *m = NULL;
+
+    m = PyModule_Create(&_SHA3module);
+    if (m == NULL) {
+        /* Nothing to clean up yet.  Falling through would end in
+         * Py_DECREF(NULL) at the error label as soon as a later step
+         * fails. */
+        return NULL;
+    }
+
+    /* Finish the static type object and publish it as m.<name>. */
+#define init_sha3type(name, type)     \
+    do {                              \
+        Py_TYPE(type) = &PyType_Type; \
+        if (PyType_Ready(type) < 0) { \
+            goto error;               \
+        }                             \
+        Py_INCREF((PyObject *)type);  \
+        if (PyModule_AddObject(m, name, (PyObject *)type) < 0) { \
+            goto error;               \
+        }                             \
+    } while(0)
+
+    init_sha3type("sha3_224", &SHA3_224type);
+    init_sha3type("sha3_256", &SHA3_256type);
+    init_sha3type("sha3_384", &SHA3_384type);
+    init_sha3type("sha3_512", &SHA3_512type);
+#ifdef PY_WITH_KECCAK
+    init_sha3type("keccak_224", &Keccak_224type);
+    init_sha3type("keccak_256", &Keccak_256type);
+    init_sha3type("keccak_384", &Keccak_384type);
+    init_sha3type("keccak_512", &Keccak_512type);
+#endif
+    init_sha3type("shake_128", &SHAKE128type);
+    init_sha3type("shake_256", &SHAKE256type);
+
+#undef init_sha3type
+
+    /* Expose which Keccak implementation was compiled in. */
+    if (PyModule_AddIntConstant(m, "keccakopt", KeccakOpt) < 0) {
+        goto error;
+    }
+    if (PyModule_AddStringConstant(m, "implementation",
+                                   KeccakP1600_implementation) < 0) {
+        goto error;
+    }
+
+    return m;
+  error:
+    /* m is known to be non-NULL here. */
+    Py_DECREF(m);
+    return NULL;
+}
diff --git a/PC/config.c b/PC/config.c
index e007d4ba69e..43d9e208cca 100644
--- a/PC/config.c
+++ b/PC/config.c
@@ -23,6 +23,7 @@ extern PyObject* PyInit__signal(void);
 extern PyObject* PyInit__sha1(void);
 extern PyObject* PyInit__sha256(void);
 extern PyObject* PyInit__sha512(void);
+extern PyObject* PyInit__sha3(void);
 extern PyObject* PyInit__blake2(void);
 extern PyObject* PyInit_time(void);
 extern PyObject* PyInit__thread(void);
@@ -97,6 +98,7 @@ struct _inittab _PyImport_Inittab[] = {
     {"_sha1", PyInit__sha1},
     {"_sha256", PyInit__sha256},
     {"_sha512", PyInit__sha512},
+    {"_sha3", PyInit__sha3},
     {"_blake2", PyInit__blake2},
     {"time", PyInit_time},
 #ifdef WITH_THREAD
diff --git a/setup.py b/setup.py
index 5450b7ed75b..6b81569d4c9 100644
--- a/setup.py
+++ b/setup.py
@@ -905,6 +905,13 @@ class PyBuildExt(build_ext):
                                define_macros=blake2_macros,
                                depends=blake2_deps) )
 
+        sha3_deps = glob(os.path.join(os.getcwd(), srcdir,
+                                      'Modules/_sha3/kcp/*'))
+        sha3_deps.append('hashlib.h')
+        exts.append( Extension('_sha3',
+                               ['_sha3/sha3module.c'],
+                               depends=sha3_deps))
+
         # Modules that provide persistent dictionary-like semantics.  You will
         # probably want to arrange for at least one of them to be available on
         # your machine, though none are defined by default because of library