|
From: Stephens, A. <all...@wi...> - 2006-08-09 18:50:08
|
Hi there:
About a week ago, I realized that the send_stream() routine in TIPC
1.5/1.6 uses an algorithm that may be significantly less than optimal.
Specifically, it partitions the outgoing byte stream into chunks of up
to 66000 bytes, and then transmits each chunk in the same way that
non-stream messages are sent. As a result, any chunk that exceeds the
link MTU is broken up into message fragments, which must be reassembled
into a single message at the receiving end. This seems to be a silly
thing to be doing for a byte-stream-oriented socket! Not only are we
doing unnecessary memory allocation and data copying on the receiving
end, we're also introducing unnecessary overhead into the wire protocol
through the use of 40 byte fragment headers where a 24 byte connected
message header would be sufficient.
=20
I've coded up a prototype patch (see below) that avoids the unwanted
fragmentation problem by using the port's maximum packet size "hint"
field to help it determine how big a chunk can be sent without exceeding
the link MTU. (For intra-node connections we should send chunks of
66000 bytes, but I don't think my patch handles this correctly yet; some
minor tweaking is required to fix this up.) I did a quick test to
verify that it did eliminate the fragmentation problem, but I haven't
yet tried to determine what (if any) improvement this has on the
performance of TIPC's stream sockets.
Before I go any further, I thought I'd ask for a bit of feedback on a
few points.
1) The maximum packet size hint field is currently a non-public field of
the "port" structure, so I had to do a type cast to allow the socket
code to access it. Is this OK, or should we really move this field to
the "tipc_port" structure where all of the other public fields are kept?
2) The fact that the receiving socket will now end up queuing a lot of
small messages rather than a lesser number of larger ones means that, by
default, the receiving application will end up making a lot more calls
to recv() to retrieve a given amount of data than it did before (since
recv_stream() only consumes one message at a time if MSG_WAITALL is not
specified). I'm concerned that the added overhead of performing a lot
more system calls will erode the performance gains obtained by not having
to reassemble message fragments. Does anyone have any idea if this is
likely to be true or not? If so, is requiring applications to specify
MSG_WAITALL to force consolidation of short chunks an acceptable
workaround?
3) I think this revised algorithm works even in the event that the link
MTU changes on the fly as a result of link switchover, although
switching over to a link with a smaller MTU may cause fragments to be
generated for the first chunk that goes over the new link.
(send_stream() should use the updated max packet hint for all subsequent
chunks.) But are there other cases I haven't considered? (For example,
what is going to happen in TIPC 1.7/1.8 in the case of messages that are
routed through an intermediate node? The MTU hint only prevents
fragmentation on the first hop ...)
=20
Regards,
Al
======= Revised routine
/**
 * send_stream - send stream-oriented data
 * @iocb: (unused)
 * @sock: socket structure
 * @m: data to send
 * @total_len: total length of data to be sent
 *
 * Used for SOCK_STREAM data.
 *
 * Returns the number of bytes sent on success (or partial success),
 * or errno if no data sent
 */
static int send_stream(struct kiocb *iocb, struct socket *sock,
		       struct msghdr *m, size_t total_len)
{
	struct port *p_ptr;
	struct msghdr my_msg;
	struct iovec my_iov;
	struct iovec *curr_iov;
	int curr_iovlen;
	char __user *curr_start;
	u32 hdr_size;
	int curr_left;
	int bytes_to_send;
	int bytes_sent;
	int res;

	/* Handle special cases where there is no connection */

	if (unlikely(sock->state != SS_CONNECTED)) {
		if (sock->state == SS_UNCONNECTED)
			return send_packet(iocb, sock, m, total_len);
		else if (sock->state == SS_DISCONNECTING)
			return -EPIPE;
		else
			return -ENOTCONN;
	}

	if (unlikely(m->msg_name))
		return -EISCONN;

	/*
	 * Send each iovec entry using one or more messages
	 *
	 * Note: This algorithm is good for the most likely case
	 * (i.e. one large iovec entry), but could be improved to pass sets
	 * of small iovec entries into send_packet().
	 */

	curr_iov = m->msg_iov;
	curr_iovlen = m->msg_iovlen;
	my_msg.msg_iov = &my_iov;
	my_msg.msg_iovlen = 1;
	my_msg.msg_flags = m->msg_flags;
	my_msg.msg_name = NULL;
	bytes_sent = 0;

	/* Cast needed: the max packet size hint is a non-public field of
	 * "struct port" rather than "struct tipc_port" — TODO confirm
	 * whether it should be promoted to the public structure */
	p_ptr = (struct port *)tipc_sk(sock->sk)->p;
	hdr_size = msg_hdr_sz(&p_ptr->publ.phdr);

	while (curr_iovlen--) {
		curr_start = curr_iov->iov_base;
		curr_left = curr_iov->iov_len;

		while (curr_left) {
			/* Cap each chunk at (max packet - header) so that a
			 * chunk fits in one link MTU and no fragmentation is
			 * needed on the first hop */
			bytes_to_send = p_ptr->max_pkt - hdr_size;
			if (curr_left < bytes_to_send)
				bytes_to_send = curr_left;

			my_iov.iov_base = curr_start;
			my_iov.iov_len = bytes_to_send;

			res = send_packet(iocb, sock, &my_msg, 0);
			if (res < 0)
				return bytes_sent ? bytes_sent : res;

			curr_left -= bytes_to_send;
			curr_start += bytes_to_send;
			bytes_sent += bytes_to_send;
		}

		curr_iov++;
	}

	return bytes_sent;
}
========= Patch file
---
net/tipc/socket.c | 22 ++++++++++++++--------
1 files changed, 14 insertions(+), 8 deletions(-)
a4e3d3f13461f4748b9fd9b65d1342b3f8ceeb57
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index a7069b7..fff773d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -54,6 +54,7 @@ #include <net/tipc/tipc_msg.h>
#include <net/tipc/tipc_port.h>
=20
#include "core.h"
+#include "port.h"
=20
#define SS_LISTENING -1 /* socket is listening */
#define SS_READY -2 /* socket is connectionless */
@@ -607,23 +608,24 @@ exit:
static int send_stream(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len)
{
+ struct port *p_ptr;
struct msghdr my_msg;
struct iovec my_iov;
struct iovec *curr_iov;
int curr_iovlen;
char __user *curr_start;
+ u32 hdr_size;
int curr_left;
int bytes_to_send;
int bytes_sent;
int res;
-
- if (likely(total_len <= TIPC_MAX_USER_MSG_SIZE))
- return send_packet(iocb, sock, m, total_len);

- /* Can only send large data streams if already connected */
+ /* Handle special cases where there is no connection */

 if (unlikely(sock->state != SS_CONNECTED)) {
- if (sock->state == SS_DISCONNECTING)
+ if (sock->state == SS_UNCONNECTED)
+ return send_packet(iocb, sock, m, total_len);
+ else if (sock->state == SS_DISCONNECTING)
 return -EPIPE;
 else
 return -ENOTCONN;
@@ -633,7 +635,7 @@ static int send_stream(struct kiocb *ioc
return -EISCONN;
=20
 /*
- * Send each iovec entry using one or more messages
+ * Send each iovec entry using one or more messages
 *
 * Note: This algorithm is good for the most likely case
 * (i.e. one large iovec entry), but could be improved to pass sets
@@ -648,13 +650,17 @@ static int send_stream(struct kiocb *ioc
my_msg.msg_name = NULL;
 bytes_sent = 0;

+ p_ptr = (struct port *)tipc_sk(sock->sk)->p;
+ hdr_size = msg_hdr_sz(&p_ptr->publ.phdr);
+
 while (curr_iovlen--) {
 curr_start = curr_iov->iov_base;
 curr_left = curr_iov->iov_len;

 while (curr_left) {
- bytes_to_send = (curr_left < TIPC_MAX_USER_MSG_SIZE)
- ? curr_left : TIPC_MAX_USER_MSG_SIZE;
+ bytes_to_send = p_ptr->max_pkt - hdr_size;
+ if (curr_left < bytes_to_send)
+ bytes_to_send = curr_left;
 my_iov.iov_base = curr_start;
 my_iov.iov_len = bytes_to_send;
 if ((res = send_packet(iocb, sock, &my_msg, 0))
 < 0) {
--=20
1.3.3
|