summaryrefslogtreecommitdiff
path: root/venv/lib/python3.11/site-packages/websockets/speedups.c
blob: a195904198f40078b7e772baafce610838b31db1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* C implementation of performance sensitive functions. */

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdint.h> /* uint8_t, uint32_t, uint64_t */

#if __ARM_NEON
#include <arm_neon.h>
#elif __SSE2__
#include <emmintrin.h>
#endif

static const Py_ssize_t MASK_LEN = 4;

/* Similar to PyBytes_AsStringAndSize, but accepts more types */

static int
_PyBytesLike_AsStringAndSize(PyObject *obj, PyObject **tmp, char **buffer, Py_ssize_t *length)
{
    // This supports bytes, bytearrays, and memoryview objects,
    // which are common data structures for handling byte streams.
    // websockets.framing.prepare_data() returns only these types.
    // If *tmp isn't NULL, the caller gets a new reference.
    if (PyBytes_Check(obj))
    {
        *tmp = NULL;
        *buffer = PyBytes_AS_STRING(obj);
        *length = PyBytes_GET_SIZE(obj);
    }
    else if (PyByteArray_Check(obj))
    {
        *tmp = NULL;
        *buffer = PyByteArray_AS_STRING(obj);
        *length = PyByteArray_GET_SIZE(obj);
    }
    else if (PyMemoryView_Check(obj))
    {
        *tmp = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
        if (*tmp == NULL)
        {
            return -1;
        }
        Py_buffer *mv_buf;
        mv_buf = PyMemoryView_GET_BUFFER(*tmp);
        *buffer = mv_buf->buf;
        *length = mv_buf->len;
    }
    else
    {
        PyErr_Format(
            PyExc_TypeError,
            "expected a bytes-like object, %.200s found",
            Py_TYPE(obj)->tp_name);
        return -1;
    }

    return 0;
}

/* C implementation of websockets.utils.apply_mask */

static PyObject *
apply_mask(PyObject *self, PyObject *args, PyObject *kwds)
{

    // In order to support various bytes-like types, accept any Python object.

    static char *kwlist[] = {"data", "mask", NULL};
    PyObject *input_obj;
    PyObject *mask_obj;

    // A pointer to a char * + length will be extracted from the data and mask
    // arguments, possibly via a Py_buffer.

    PyObject *input_tmp = NULL;
    char *input;
    Py_ssize_t input_len;
    PyObject *mask_tmp = NULL;
    char *mask;
    Py_ssize_t mask_len;

    // Initialize a PyBytesObject then get a pointer to the underlying char *
    // in order to avoid an extra memory copy in PyBytes_FromStringAndSize.

    PyObject *result = NULL;
    char *output;

    // Other variables.

    Py_ssize_t i = 0;

    // Parse inputs.

    if (!PyArg_ParseTupleAndKeywords(
            args, kwds, "OO", kwlist, &input_obj, &mask_obj))
    {
        goto exit;
    }

    if (_PyBytesLike_AsStringAndSize(input_obj, &input_tmp, &input, &input_len) == -1)
    {
        goto exit;
    }

    if (_PyBytesLike_AsStringAndSize(mask_obj, &mask_tmp, &mask, &mask_len) == -1)
    {
        goto exit;
    }

    if (mask_len != MASK_LEN)
    {
        PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes");
        goto exit;
    }

    // Create output.

    result = PyBytes_FromStringAndSize(NULL, input_len);
    if (result == NULL)
    {
        goto exit;
    }

    // Since we just created result, we don't need error checks.
    output = PyBytes_AS_STRING(result);

    // Perform the masking operation.

    // Apparently GCC cannot figure out the following optimizations by itself.

    // We need a new scope for MSVC 2010 (non C99 friendly)
    {
#if __ARM_NEON

        // With NEON support, XOR by blocks of 16 bytes = 128 bits.

        Py_ssize_t input_len_128 = input_len & ~15;
        uint8x16_t mask_128 = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t *)mask));

        for (; i < input_len_128; i += 16)
        {
            uint8x16_t in_128 = vld1q_u8((uint8_t *)(input + i));
            uint8x16_t out_128 = veorq_u8(in_128, mask_128);
            vst1q_u8((uint8_t *)(output + i), out_128);
        }

#elif __SSE2__

        // With SSE2 support, XOR by blocks of 16 bytes = 128 bits.

        // Since we cannot control the 16-bytes alignment of input and output
        // buffers, we rely on loadu/storeu rather than load/store.

        Py_ssize_t input_len_128 = input_len & ~15;
        __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask);

        for (; i < input_len_128; i += 16)
        {
            __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i));
            __m128i out_128 = _mm_xor_si128(in_128, mask_128);
            _mm_storeu_si128((__m128i *)(output + i), out_128);
        }

#else

        // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits.

        // We assume the memory allocator aligns everything on 8 bytes boundaries.

        Py_ssize_t input_len_64 = input_len & ~7;
        uint32_t mask_32 = *(uint32_t *)mask;
        uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32;

        for (; i < input_len_64; i += 8)
        {
            *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64;
        }

#endif
    }

    // XOR the remainder of the input byte by byte.

    for (; i < input_len; i++)
    {
        output[i] = input[i] ^ mask[i & (MASK_LEN - 1)];
    }

exit:
    Py_XDECREF(input_tmp);
    Py_XDECREF(mask_tmp);
    return result;

}

static PyMethodDef speedups_methods[] = {
    {
        "apply_mask",
        (PyCFunction)apply_mask,
        METH_VARARGS | METH_KEYWORDS,
        "Apply masking to the data of a WebSocket message.",
    },
    {NULL, NULL, 0, NULL},      /* Sentinel */
};

static struct PyModuleDef speedups_module = {
    PyModuleDef_HEAD_INIT,
    "websocket.speedups",       /* m_name */
    "C implementation of performance sensitive functions.",
                                /* m_doc */
    -1,                         /* m_size */
    speedups_methods,           /* m_methods */
    NULL,
    NULL,
    NULL,
    NULL
};

PyMODINIT_FUNC
PyInit_speedups(void)
{
    return PyModule_Create(&speedups_module);
}