#include <onload/extensions_zc.h>
1.1.Zero‐Copy Data Buffers
To avoid the copy, data is passed to and from the application in special buffers, each described by a struct onload_zc_iovec. A message or datagram can consist of multiple iovecs, using a struct onload_zc_msg. A single call to send may involve multiple messages, using an array of struct onload_zc_mmsg.
/* onload_zc_iovec: describes one buffer. */
struct onload_zc_iovec {
  void* iov_base;        /* Address within buffer */
  size_t iov_len;        /* Length of data */
  onload_zc_handle buf;  /* (opaque) buffer handle */
  unsigned iov_flags;    /* Not currently used */
};

/* onload_zc_msg: the array of iovecs that together make up one datagram. */
struct onload_zc_msg {
  struct onload_zc_iovec* iov;  /* Array of buffers */
  struct msghdr msghdr;         /* Message metadata */
};

/* onload_zc_mmsg: one message, the socket it goes out on, and its result. */
struct onload_zc_mmsg {
  struct onload_zc_msg msg;  /* Message */
  int rc;                    /* Result of send operation */
  int fd;                    /* socket to send on */
};
1.2.Zero‐Copy TCP Send Overview
Figure 31 illustrates the difference between the normal TCP transmit method and
the zero‐copy method.
When using standard POSIX socket calls, the application first creates the payload
data in an application-allocated buffer before calling the send() function. Onload
will copy the data to an Onload packet buffer in memory and post a descriptor to this
buffer in the network adapter TX descriptor ring.
Using the zero‐copy TCP transmit API the application calls the
onload_zc_alloc_buffers()
function to request buffers from Onload. A pointer
to a packet buffer is returned in response. The application places the data to send
directly into this buffer and then calls
onload_zc_send()
to indicate to Onload that
data is available to send.
Onload will post a descriptor for the packet buffer in the network adapter TX
descriptor ring and ring the TX doorbell. The network adapter fetches the data for
transmission.
The socket used to allocate zero‐copy buffers must be in the same stack as
the socket used to send the buffers. When using TCP loopback, Onload can move a
socket from one stack to another. Users must ensure that they ALWAYS USE
BUFFERS FROM THE CORRECT STACK.
Zero‐copy TCP transmit is implemented within the Onload Extensions API.
2.3.Zero‐Copy TCP Send
The zero‐copy send API supports the sending of multiple messages to different
sockets in a single call. Data buffers must be allocated in advance and for best
efficiency these should be allocated in blocks and off the critical path. The user
should avoid simply moving the copy from Onload into the application, but where
this is unavoidable, it should also be done off the critical path.
/* Send the mlen messages in the msgs array, which may target different
 * sockets. Per the accompanying text, the return value identifies how many
 * of the msgs[].rc fields have been set; each rc then holds the number of
 * bytes sent for that message, or an error. */
int onload_zc_send(struct onload_zc_mmsg* msgs, int mlen, int flags);
/* Request iovecs_len zero-copy buffers using socket fd and describe them in
 * iovecs. The buffers must later be sent on a socket in the same stack as fd.
 * The examples in this section treat a return value of 0 as success. */
int onload_zc_alloc_buffers(int fd,
struct onload_zc_iovec* iovecs,
int iovecs_len,
onload_zc_buffer_type_flags flags);
/* Hand back the bufs_len buffer handles in bufs. The examples call this for
 * buffers that were allocated but not sent. */
int onload_zc_release_buffers(int fd,
onload_zc_handle* bufs,
int bufs_len);
The onload_zc_send() function return value identifies how many of the
onload_zc_mmsg array’s rc fields are set. Each onload_zc_mmsg.rc returns how
many bytes were sent for that message, or an error. Refer to the table below.
Buffers sent with the ONLOAD_MSG_WARM feature enabled are not actually sent;
ownership of these buffers remains with the user, who is responsible for
freeing them.
2.4.Zero‐Copy Send ‐ Single Message, Single Buffer
/* Single message, single buffer: allocate one zero-copy buffer, copy the
 * payload into it, send it, and handle each class of return code.
 * fd, rc, my_data and my_data_len are supplied by the surrounding code. */
struct onload_zc_iovec iovec;
struct onload_zc_mmsg mmsg;

rc = onload_zc_alloc_buffers(fd, &iovec, 1, ONLOAD_ZC_BUFFER_HDR_TCP);
assert(rc == 0);
/* The payload must fit in the buffer we were given */
assert(my_data_len <= iovec.iov_len);
memcpy(iovec.iov_base, my_data, my_data_len);
iovec.iov_len = my_data_len;

mmsg.fd = fd;
mmsg.msg.iov = &iovec;
mmsg.msg.msghdr.msg_iovlen = 1;

rc = onload_zc_send(&mmsg, 1, 0);
if( rc <= 0 ) {
  /* Probably application bug.  No mmsg.rc was set, so the message was not
   * sent and the buffer is still ours: release it rather than leak it. */
  onload_zc_release_buffers(fd, &iovec.buf, 1);
  return rc;
}
else {
  /* Only one message, so rc should be 1 */
  assert(rc == 1);
  /* rc == 1 so we can look at the first (only) mmsg.rc */
  if( mmsg.rc < 0 ) {
    /* Error sending message */
    onload_zc_release_buffers(fd, &iovec.buf, 1);
  }
  else {
    /* Message sent, single msg, single iovec so
     * shouldn't worry about partial sends */
    assert(mmsg.rc == my_data_len);
  }
}
The example above demonstrates error code handling. Note that it contains an example
of bad practice, where buffers are allocated and populated on the critical path.
2.5.Zero‐Copy Send ‐ Multiple Message, Multiple Buffers
#define N_BUFFERS 2
#define N_MSGS 2

/* Multiple messages, multiple buffers: one row of N_BUFFERS zero-copy
 * buffers per message.  fd, rc, i and j are supplied by the surrounding
 * code. */
struct onload_zc_iovec iovec[N_MSGS][N_BUFFERS];
struct onload_zc_mmsg mmsg[N_MSGS];

for( i = 0; i < N_MSGS; ++i ) {
  rc = onload_zc_alloc_buffers(fd, iovec[i], N_BUFFERS, ONLOAD_ZC_BUFFER_HDR_TCP);
  assert(rc == 0);
  /* TODO store data in iovec[i][j].iov_base,
   * set iovec[i][j].iov_len */
  mmsg[i].fd = fd; /* Could be different for each message */
  mmsg[i].msg.iov = iovec[i];
  mmsg[i].msg.msghdr.msg_iovlen = N_BUFFERS;
}

rc = onload_zc_send(mmsg, N_MSGS, 0);
if( rc <= 0 ) {
  /* Probably application bug.  No mmsg[i].rc was set, so nothing was sent
   * and every buffer is still ours: release them rather than leak them. */
  for( i = 0; i < N_MSGS; ++i )
    for( j = 0; j < N_BUFFERS; ++j )
      onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
  return rc;
}
else {
  for( i = 0; i < N_MSGS; ++i ) {
    if( i < rc ) {
      /* mmsg[i].rc is set and we can use it */
      size_t msg_len = 0;

      /* Total number of bytes queued for this message */
      for( j = 0; j < N_BUFFERS; ++j )
        msg_len += iovec[i][j].iov_len;

      if( mmsg[i].rc < 0 ) {
        /* error sending this message - release buffers */
        for( j = 0; j < N_BUFFERS; ++j )
          onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
      }
      else if( (size_t)mmsg[i].rc < msg_len ) {
        /* partial success */
        /* TODO use mmsg[i].rc to determine which buffers in
         * iovec[i] array are sent and which are still
         * owned by application */
      }
      else {
        /* Whole message sent, buffers now owned by Onload */
      }
    }
    else {
      /* mmsg[i].rc is not set, this message was not sent */
      for( j = 0; j < N_BUFFERS; ++j )
        onload_zc_release_buffers(fd, &iovec[i][j].buf, 1);
    }
  }
}
The example above demonstrates error code handling and contains some examples
of bad practice where buffers are allocated and populated on the critical path.
2.6.Zero‐Copy Send ‐ Full Example
static struct onload_zc_iovec iovec[NUM_ZC_BUFFERS];
static ssize_t do_send_zc(int fd, const void* buf, size_t len, int flags)
{
int bytes_done, rc, i, bufs_needed;
struct onload_zc_mmsg mmsg;
mmsg.fd = fd;
mmsg.msg.iov = iovec;
bytes_done = 0;
mmsg.msg.msghdr.msg_iovlen = 0;
while( bytes_done < len ) {
if( iovec[mmsg.msg.msghdr.msg_iovlen].iov_len > (len ‐ bytes_done))
iovec[mmsg.msg.msghdr.msg_iovlen].iov_len = (len ‐ bytes_done);
memcpy(iovec[i].iov_base, buf+bytes_done, iov_len);
bytes_done += iovec[mmsg.msg.msghdr.msg_iovlen].iov_len;
++mmsg.msg.msghdr.msg_iovlen;
}
rc = onload_zc_send(&mmsg, 1, 0);
if( rc != 1 /* Number of messages we sent */ ) {
printf("onload_zc_send failed to process msg, %d\n", rc);
return ‐1;
} else {
if( mmsg.rc < 0 )
printf("onload_zc_send message error %d\n", mmsg.rc);
else {
/* Iterate over the iovecs; any that were sent we must
replenish. */
i = 0; bufs_needed= 0;
while( i < mmsg.msg.msghdr.msg_iovlen ) {
if( bytes_done == mmsg.rc ) {
printf(onload_zc_send did not send iovec %d\n", i);
/* In other buffer allocation schemes we would have to
release
* these buffers, but seems pointless as we guarantee at the
* end of this function to have iovec array full, so do
nothing. */
} else {
/* Buffer sent, now owned by Onload, so replenish iovec
array */
++bufs needed;
bytes_done += iovec[i].iov_len;
}
++i;
}
if( bufs_needed ) /* replenish the iovec array */
rc = onload_zc_alloc_buffers(fd, iovec, bufs_needed, ONLOAD_ZC_BUFFER_HDR_TCP);
}
}
/* Set a return code that looks similar enough to send(). NB. we're
* not setting (and neither does onload_zc_send()) errno */
if( mmsg.rc < 0 ) return ‐1;
else return bytes_done;
}