I've been trying for a while to diagnose some software problems in a macOS 10.12 guest running under VMware Fusion 8.5, using various network adapters.
Basically, when a TCP connection to a remote host is open and the TCP window hits 0, the window is never updated to a larger value. This directly results in a socket timeout or hang whenever network speed is high enough to exhaust the TCP receive buffer.
Here is a fully reproducible set of programs to demonstrate the issue. Please pardon the code, I definitely wasn't writing this to be clean--just to demonstrate an issue. They can be compiled with `cc client.c -o client` and `cc server.c -o server`. The programs were directly copied from Linux Howtos: C/C++ -> Sockets Tutorial with modification to keep the connection open forever and to add a timeout.
client.c:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
/*
 * Report a failed call and terminate the program.
 *
 * msg is handed to perror(), which prints it on stderr followed by the
 * description of the current errno value. The process then exits with a
 * failure status so that shells and scripts can tell the run did not
 * succeed. This function does not return.
 */
void error(const char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
}
/*
 * TCP client used to reproduce the zero-window stall.
 *
 * Usage: client hostname port
 *
 * Connects to hostname:port, sleeps for one second (presumably so the sender
 * can fill the socket receive buffer), and then drains the socket in
 * 256-byte reads. Every read is preceded by a select() with a one-second
 * timeout; the first time no data arrives within that second the program
 * prints "timeout" and stops.
 *
 * Returns 0 after a timeout or after the peer closes the connection. Usage
 * and host lookup problems exit with a failure status; failed system calls
 * are reported through error().
 */
int main(int argc, char *argv[])
{
    enum { READ_CHUNK = 256, READ_TIMEOUT_SEC = 1 };

    int sockfd;
    long port;
    char *end = NULL;
    struct sockaddr_in serv_addr;
    struct hostent *server;
    char buffer[READ_CHUNK];
    /* Wide unsigned counter: an int would overflow after 2 GiB received. */
    unsigned long long total = 0;

    if (argc < 3) {
        fprintf(stderr, "usage %s hostname port\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* strtol() lets us reject non-numeric or out-of-range ports up front. */
    port = strtol(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || port < 1 || port > 65535) {
        fprintf(stderr, "ERROR, invalid port '%s'\n", argv[2]);
        exit(EXIT_FAILURE);
    }

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        error("ERROR opening socket");
    }

    server = gethostbyname(argv[1]);
    if (server == NULL) {
        fprintf(stderr, "ERROR, no such host\n");
        exit(EXIT_FAILURE);
    }
    /* Only copy the address if it really is an IPv4 address that fits. */
    if (server->h_addrtype != AF_INET || server->h_length < 0 ||
        (size_t)server->h_length > sizeof(serv_addr.sin_addr)) {
        fprintf(stderr, "ERROR, no IPv4 address for host\n");
        exit(EXIT_FAILURE);
    }

    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    memcpy(&serv_addr.sin_addr, server->h_addr_list[0],
           (size_t)server->h_length);
    serv_addr.sin_port = htons((in_port_t)port);

    if (connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
        error("ERROR connecting");
    }

    sleep(1);

    for (;;) {
        fd_set readable;
        struct timeval timeout;
        int ready;

        /*
         * select() overwrites the fd set, and on some systems (Linux) also
         * the timeout with the time remaining, so both must be rebuilt
         * before every call.
         */
        FD_ZERO(&readable);
        FD_SET(sockfd, &readable);
        timeout.tv_sec = READ_TIMEOUT_SEC;
        timeout.tv_usec = 0;

        ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
        if (ready < 0) {
            if (errno == EINTR) {
                continue; /* interrupted by a signal: just retry */
            }
            error("ERROR in select");
        } else if (ready == 0) {
            printf("timeout\n");
            break;
        } else {
            ssize_t n = read(sockfd, buffer, sizeof(buffer));

            if (n < 0) {
                error("ERROR reading from socket");
            } else if (n == 0) {
                /* EOF: without this the loop would spin forever. */
                printf("connection closed by peer\n");
                break;
            } else {
                total += (unsigned long long)n;
                printf("read %zd / %llu\n", n, total);
            }
        }
    }

    close(sockfd);
    return 0;
}
server.c:
/* A simple server in the internet domain using TCP
The port number is passed as an argument */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
/*
 * Print msg together with the description of the current errno value on
 * stderr (via perror) and terminate the process with a failure exit status.
 * This function does not return.
 */
void error(const char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
}
/*
 * TCP server used to reproduce the zero-window stall.
 *
 * Usage: server port
 *
 * Listens on the given port on all local IPv4 addresses, accepts a single
 * connection, and then writes 1024-byte blocks of 0xAB bytes to it in an
 * endless loop, printing the size of each write and the running total.
 *
 * The write loop has no normal exit: the program ends when a call fails and
 * error() is invoked, or when the process is terminated from outside.
 */
int main(int argc, char *argv[])
{
    enum { WRITE_CHUNK = 1024, LISTEN_BACKLOG = 5 };

    int sockfd, newsockfd;
    long port;
    char *end = NULL;
    socklen_t clilen;
    unsigned char buffer[WRITE_CHUNK];
    struct sockaddr_in serv_addr, cli_addr;
    /* Wide unsigned counter: an int would overflow after 2 GiB sent. */
    unsigned long long total = 0;

    if (argc < 2) {
        fprintf(stderr, "ERROR, no port provided\n");
        exit(1);
    }

    /* strtol() lets us reject non-numeric or out-of-range ports up front. */
    port = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || port < 1 || port > 65535) {
        fprintf(stderr, "ERROR, invalid port '%s'\n", argv[1]);
        exit(1);
    }

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        error("ERROR opening socket");
    }

    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
    serv_addr.sin_port = htons((in_port_t)port);

    if (bind(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
        error("ERROR on binding");
    }
    if (listen(sockfd, LISTEN_BACKLOG) < 0) {
        error("ERROR on listen");
    }

    clilen = sizeof(cli_addr);
    newsockfd = accept(sockfd, (struct sockaddr *)&cli_addr, &clilen);
    if (newsockfd < 0) {
        error("ERROR on accept");
    }

    /*
     * Fill pattern is the byte 0xAB. (The quoted form '0xAB' is a
     * multi-character constant with an implementation-defined value, not
     * the intended byte.)
     */
    memset(buffer, 0xAB, sizeof(buffer));

    for (;;) {
        ssize_t n = write(newsockfd, buffer, sizeof(buffer));

        if (n < 0) {
            error("ERROR writing to socket");
        } else {
            /* write() may be partial; count what was actually sent. */
            total += (unsigned long long)n;
        }
        printf("wrote %zd / %llu\n", n, total);
    }
}
Usage
Run the server on your host with `./server $port_number` and the client in your vm with `./client $server_ip_address $port_number`. You'll notice a socket timeout pretty quickly.
Diagnosis
If you run Wireshark at the same time, you'll see the TCP window shrink to 0 and the server repeatedly sending zero-window probes. The client never recovers.
I believe the issue is related to the network drivers in the VM, as the symptom isn't present in either a host-to-host or a guest-to-guest transfer. The problem also doesn't occur between hosts on different network nodes.
The problem presented here causes a myriad of other problems, such as git clones failing inside guests (as reported by another user) and large file downloads failing (as reported by a user on Stack Overflow 6 months ago).
Version Info
From host:
$ '/Applications/VMware Fusion.app/Contents/Library/vmware-vmx' -v
VMware Fusion Information:
VMware Fusion 8.5.3 build-4696910 Release
From guest:
$ /Library/Application\ Support/VMware\ Tools/vmware-tools-cli --version
10.0.10.3275 (build-4301679)
Questions
1) Is this a known bug and does it have an ETA on resolution?
2) Is there a workaround?
3) Does anyone know of a version of VMware that doesn't have this bug?