From 6d7ba58f880be618ade07f8ea080fe8c4bf8a896 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Wed, 3 Apr 2024 03:10:44 -0400 Subject: venv --- .../python3.11/site-packages/websockets/speedups.c | 223 +++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 venv/lib/python3.11/site-packages/websockets/speedups.c (limited to 'venv/lib/python3.11/site-packages/websockets/speedups.c') diff --git a/venv/lib/python3.11/site-packages/websockets/speedups.c b/venv/lib/python3.11/site-packages/websockets/speedups.c new file mode 100644 index 0000000..a195904 --- /dev/null +++ b/venv/lib/python3.11/site-packages/websockets/speedups.c @@ -0,0 +1,223 @@ +/* C implementation of performance sensitive functions. */ + +#define PY_SSIZE_T_CLEAN +#include +#include /* uint8_t, uint32_t, uint64_t */ + +#if __ARM_NEON +#include +#elif __SSE2__ +#include +#endif + +static const Py_ssize_t MASK_LEN = 4; + +/* Similar to PyBytes_AsStringAndSize, but accepts more types */ + +static int +_PyBytesLike_AsStringAndSize(PyObject *obj, PyObject **tmp, char **buffer, Py_ssize_t *length) +{ + // This supports bytes, bytearrays, and memoryview objects, + // which are common data structures for handling byte streams. + // websockets.framing.prepare_data() returns only these types. + // If *tmp isn't NULL, the caller gets a new reference. + if (PyBytes_Check(obj)) + { + *tmp = NULL; + *buffer = PyBytes_AS_STRING(obj); + *length = PyBytes_GET_SIZE(obj); + } + else if (PyByteArray_Check(obj)) + { + *tmp = NULL; + *buffer = PyByteArray_AS_STRING(obj); + *length = PyByteArray_GET_SIZE(obj); + } + else if (PyMemoryView_Check(obj)) + { + *tmp = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); + if (*tmp == NULL) + { + return -1; + } + Py_buffer *mv_buf; + mv_buf = PyMemoryView_GET_BUFFER(*tmp); + *buffer = mv_buf->buf; + *length = mv_buf->len; + } + else + { + PyErr_Format( + PyExc_TypeError, + "expected a bytes-like object, %.200s found", + Py_TYPE(obj)->tp_name); + return -1; + } + + return 0; +} + +/* C implementation of websockets.utils.apply_mask */ + +static PyObject * +apply_mask(PyObject *self, PyObject *args, PyObject *kwds) +{ + + // In order to support various bytes-like types, accept any Python object. + + static char *kwlist[] = {"data", "mask", NULL}; + PyObject *input_obj; + PyObject *mask_obj; + + // A pointer to a char * + length will be extracted from the data and mask + // arguments, possibly via a Py_buffer. + + PyObject *input_tmp = NULL; + char *input; + Py_ssize_t input_len; + PyObject *mask_tmp = NULL; + char *mask; + Py_ssize_t mask_len; + + // Initialize a PyBytesObject then get a pointer to the underlying char * + // in order to avoid an extra memory copy in PyBytes_FromStringAndSize. + + PyObject *result = NULL; + char *output; + + // Other variables. + + Py_ssize_t i = 0; + + // Parse inputs. + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "OO", kwlist, &input_obj, &mask_obj)) + { + goto exit; + } + + if (_PyBytesLike_AsStringAndSize(input_obj, &input_tmp, &input, &input_len) == -1) + { + goto exit; + } + + if (_PyBytesLike_AsStringAndSize(mask_obj, &mask_tmp, &mask, &mask_len) == -1) + { + goto exit; + } + + if (mask_len != MASK_LEN) + { + PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes"); + goto exit; + } + + // Create output. + + result = PyBytes_FromStringAndSize(NULL, input_len); + if (result == NULL) + { + goto exit; + } + + // Since we just created result, we don't need error checks. + output = PyBytes_AS_STRING(result); + + // Perform the masking operation. + + // Apparently GCC cannot figure out the following optimizations by itself. + + // We need a new scope for MSVC 2010 (non C99 friendly) + { +#if __ARM_NEON + + // With NEON support, XOR by blocks of 16 bytes = 128 bits. + + Py_ssize_t input_len_128 = input_len & ~15; + uint8x16_t mask_128 = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t *)mask)); + + for (; i < input_len_128; i += 16) + { + uint8x16_t in_128 = vld1q_u8((uint8_t *)(input + i)); + uint8x16_t out_128 = veorq_u8(in_128, mask_128); + vst1q_u8((uint8_t *)(output + i), out_128); + } + +#elif __SSE2__ + + // With SSE2 support, XOR by blocks of 16 bytes = 128 bits. + + // Since we cannot control the 16-bytes alignment of input and output + // buffers, we rely on loadu/storeu rather than load/store. + + Py_ssize_t input_len_128 = input_len & ~15; + __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask); + + for (; i < input_len_128; i += 16) + { + __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i)); + __m128i out_128 = _mm_xor_si128(in_128, mask_128); + _mm_storeu_si128((__m128i *)(output + i), out_128); + } + +#else + + // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits. + + // We assume the memory allocator aligns everything on 8 bytes boundaries. + + Py_ssize_t input_len_64 = input_len & ~7; + uint32_t mask_32 = *(uint32_t *)mask; + uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32; + + for (; i < input_len_64; i += 8) + { + *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64; + } + +#endif + } + + // XOR the remainder of the input byte by byte. + + for (; i < input_len; i++) + { + output[i] = input[i] ^ mask[i & (MASK_LEN - 1)]; + } + +exit: + Py_XDECREF(input_tmp); + Py_XDECREF(mask_tmp); + return result; + +} + +static PyMethodDef speedups_methods[] = { + { + "apply_mask", + (PyCFunction)apply_mask, + METH_VARARGS | METH_KEYWORDS, + "Apply masking to the data of a WebSocket message.", + }, + {NULL, NULL, 0, NULL}, /* Sentinel */ +}; + +static struct PyModuleDef speedups_module = { + PyModuleDef_HEAD_INIT, + "websocket.speedups", /* m_name */ + "C implementation of performance sensitive functions.", + /* m_doc */ + -1, /* m_size */ + speedups_methods, /* m_methods */ + NULL, + NULL, + NULL, + NULL +}; + +PyMODINIT_FUNC +PyInit_speedups(void) +{ + return PyModule_Create(&speedups_module); +} -- cgit v1.2.3