|
蓝森林 http://www.lslnet.com 2006年6月6日 10:18
[求助]有过高效spider开发经验的人进来,解决问题了我请吃饭:)
socket效率问题:抓取http://news.sina.com.cn/ 不做任何分析,光抓取;在linux单cpu 1G内存的机器上,我的结果是:
1.非阻塞I/O + pthread(10个) 一共成功请求1000个耗时300秒
2.阻塞I/O + pthread(10个) 一共成功请求1000个耗时220秒
3.非阻塞I/O + select(一次轮询10个,30个,50个)基本差不多,成功请求1000次耗时大概在450s左右
4.poll,epoll我还没测,我怀疑是我代码写得有问题,我旁边一哥们用Delphi写了个,URL解析+内容解析每秒能成功请求30个,哪位高手帮忙看看,指点一二。最好实验一下,别想当然,我时间紧迫到poll epoll方式都没法子测试了,希望大家帮忙,谢谢。 |
回复 1楼 sevendays 的帖子
自己顶阿 |
回复 1楼 sevendays 的帖子
大虾们,帮帮忙啊:) |
我贴出源代码,大家伙帮忙看看啊,select的
#include <iostream>
#include <string>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <poll.h>
#include <semaphore.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
using namespace std;
#define MAXSOCKET 20
#define _MAXLOOP_ 10
/*
 * Resolve a host name to the dotted-decimal text of its first address.
 *
 * Returns an empty string when `hostname` is empty or when gethostbyname()
 * yields no entry / no address.
 */
std::string gethost_byname(const std::string &hostname)
{
    if (hostname.empty())
        return std::string();

    const struct hostent *entry = gethostbyname(hostname.c_str());
    if (entry == NULL || entry->h_addr == NULL)
        return std::string();

    /* h_addr is the first element of the resolver's address list. */
    const struct in_addr *first = (const struct in_addr *)entry->h_addr;
    return std::string(inet_ntoa(*first));
}
/*
 * Add O_NONBLOCK to the file status flags of `sock`.
 *
 * Returns 0 on success and -1 when either fcntl() call fails; a failing
 * F_SETFL is additionally reported through perror().
 */
int setnonblocking(int sock)
{
    const int current = fcntl(sock, F_GETFL);
    if (current < 0)
        return -1;

    if (fcntl(sock, F_SETFL, current | O_NONBLOCK) >= 0)
        return 0;

    perror("fcntl(sock,SETFL,opts)");
    return -1;
}
/*
 * Send the whole NUL-terminated request `str` on socket `fd`.
 *
 * Works for blocking and non-blocking sockets: a short write is resumed,
 * EINTR is retried, and EAGAIN/EWOULDBLOCK waits (bounded) for the socket to
 * become writable again instead of being reported as a failure.
 *
 * Returns 0 once every byte has been handed to the kernel (an empty request
 * is trivially complete), -1 on invalid arguments (fd <= 0 or str == NULL),
 * on a send error, or when the socket stays unwritable for too long.
 * Send errors are printed to stdout.
 */
int sendWebRequest(const int &fd,const char* str)
{
    /* Longest time to wait for a full send buffer to drain. */
    const int WRITABLE_WAIT_MS = 3000;

    if( fd <= 0 || str == NULL )
        return -1;

    /* Ask for EPIPE instead of a process-killing SIGPIPE where supported. */
#ifdef MSG_NOSIGNAL
    const int flags = MSG_NOSIGNAL;
#else
    const int flags = 0;
#endif

    const size_t total = strlen(str);
    size_t sent = 0;

    while( sent < total )
    {
        const ssize_t n = send(fd, str + sent, total - sent, flags);
        if( n > 0 )
        {
            sent += (size_t)n;
            continue;
        }
        if( n == 0 )
        {
            /* Nothing was accepted for a non-empty buffer: give up quietly. */
            return -1;
        }

        if( errno == EINTR )
            continue;

        if( errno == EAGAIN || errno == EWOULDBLOCK )
        {
            struct pollfd pfd;
            pfd.fd = fd;
            pfd.events = POLLOUT;
            pfd.revents = 0;

            int ready;
            do
            {
                ready = poll(&pfd, 1, WRITABLE_WAIT_MS);
            } while( ready < 0 && errno == EINTR );

            /* Writable, or an error/hang-up that the next send() will report. */
            if( ready > 0 )
                continue;
            if( ready == 0 )
                errno = ETIMEDOUT;
        }

        /* Capture errno before any other call can overwrite it. */
        const int saved = errno;
        printf("ERROR:%s",strerror(saved));
        fflush(stdout);
        return -1;
    }
    return 0;
}
/*
 * Drain a response from socket `fd`, discarding the data.
 *
 * Reading stops when the peer closes the connection or when the bytes
 * received so far end with "\r\n\r\n". The last four bytes are tracked in a
 * sliding window, so a terminator split across several short reads is still
 * recognised.
 *
 * On a non-blocking socket EAGAIN/EWOULDBLOCK waits (bounded) for more data
 * instead of spinning; EINTR is retried.
 *
 * Returns 0 on a completed read, -1 on a recv error or when no data arrives
 * within the wait limit. Errors are printed to stdout.
 */
int recvBack(const int &fd)
{
    /* Longest time to wait for the peer to send more data. */
    const int READABLE_WAIT_MS = 3000;
    static const char terminator[4] = { '\r', '\n', '\r', '\n' };

    char buf[1024];
    char tail[4] = { 0, 0, 0, 0 };   /* last four bytes seen so far */

    for(;;)
    {
        const ssize_t got = recv(fd, buf, sizeof(buf), 0);
        if( got == 0 )
        {
            /* Orderly shutdown by the peer. */
            break;
        }
        if( got < 0 )
        {
            if( errno == EINTR )
                continue;

            if( errno == EAGAIN || errno == EWOULDBLOCK )
            {
                struct pollfd pfd;
                pfd.fd = fd;
                pfd.events = POLLIN;
                pfd.revents = 0;

                int ready;
                do
                {
                    ready = poll(&pfd, 1, READABLE_WAIT_MS);
                } while( ready < 0 && errno == EINTR );

                /* Readable, or an error/hang-up that the next recv() will report. */
                if( ready > 0 )
                    continue;
                if( ready == 0 )
                    errno = ETIMEDOUT;
            }

            /* Capture errno before any other call can overwrite it. */
            const int saved = errno;
            printf("recv error:%s",strerror(saved) );
            fflush(stdout);
            return -1;
        }

        const size_t len = (size_t)got;
        if( len >= sizeof(tail) )
        {
            memcpy(tail, buf + len - sizeof(tail), sizeof(tail));
        }
        else
        {
            /* Short chunk: slide the old bytes left, then append the new ones. */
            memmove(tail, tail + len, sizeof(tail) - len);
            memcpy(tail + sizeof(tail) - len, buf, len);
        }

        if( memcmp(tail, terminator, sizeof(tail)) == 0 )
            break;
    }
    return 0;
}
/*
 * Benchmark driver: runs _MAXLOOP_ rounds; in each round it opens MAXSOCKET
 * non-blocking connections to bn.sina.com.cn:80, sends one fixed GET request
 * on each as soon as the connection is writable, and drains the response.
 * select() multiplexes the sockets of a round. At the end it prints the
 * elapsed seconds, the number of attempted requests and the number of
 * responses that were read successfully.
 *
 * Every socket opened in a round is closed before the next round starts,
 * including sockets abandoned because of an error or a select() timeout.
 *
 * Returns 0 after the report has been printed, 1 if the host name cannot be
 * resolved.
 */
int main(void)
{
    char request[] = "GET /doc/index.html HTTP/1.1\r\nHOST: bn.sina.com.cn"
                     "\r\nAccept: */*\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n"
                     "Pragma: no-cache\r\nCache-Control: no-cache\r\nConnection: close\r\n\r\n";
    printf("%s",request);

    /* Resolve the target once; every connection reuses the same address. */
    const std::string ip = gethost_byname("bn.sina.com.cn");
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(80);
    sin.sin_addr.s_addr = inet_addr( ip.c_str() );
    if( ip.empty() || sin.sin_addr.s_addr == INADDR_NONE )
    {
        fprintf(stderr, "cannot resolve bn.sina.com.cn\n");
        return 1;
    }

    /* Per-call select() timeout; select() may modify its argument, so a copy is passed. */
    struct timeval tmd;
    tmd.tv_sec = 3;
    tmd.tv_usec = 0;

    fd_set wset;                 /* sockets still connecting (waiting to become writable) */
    fd_set rset;                 /* sockets with a request sent (waiting for the response) */
    int clientfd[MAXSOCKET];     /* open socket per slot, -1 when the slot is free */

    long start_secs = time(NULL);
    int goodCount = 0;

    for( unsigned int i = 0; i < _MAXLOOP_ ; i ++ )
    {
        int fwcount = 0;         /* number of fds in wset */
        int frcount = 0;         /* number of fds in rset */
        int maxfd = 0;
        FD_ZERO(&wset);
        FD_ZERO(&rset);

        /* Start all connections of this round. */
        for( int j = 0; j < MAXSOCKET; j ++ )
        {
            clientfd[j] = -1;
            int fd = socket(AF_INET,SOCK_STREAM,0);
            if ( fd <= 0 )
                continue;
            /* fd_set cannot represent descriptors >= FD_SETSIZE. */
            if( fd >= FD_SETSIZE || setnonblocking(fd) == -1 )
            {
                close(fd);
                continue;
            }

            if( connect(fd,(struct sockaddr*)&sin,sizeof(sin)) < 0 )
            {
                if( errno != EINPROGRESS )
                {
                    /* Report before close() can overwrite errno. */
                    printf("connect failture[%s]\n",strerror(errno) );
                    close(fd);
                    continue;
                }
                /* Connection in progress: wait until the socket is writable. */
                fwcount ++;
                FD_SET(fd,&wset);
            }
            else
            {
                /* Connected immediately: send the request right away. */
                if( sendWebRequest(fd,request) != 0 )
                {
                    close(fd);
                    continue;
                }
                frcount ++;
                FD_SET(fd,&rset);
            }

            clientfd[j] = fd;
            if( maxfd < fd )
                maxfd = fd;
        }

        /* Event loop for this round. */
        while( fwcount > 0 || frcount > 0 )
        {
            fd_set w_set = wset;
            fd_set r_set = rset;
            struct timeval u_tmd = tmd;

            int nsel = select(maxfd + 1,&r_set,&w_set,NULL,&u_tmd);
            if( nsel == 0 )
            {
                printf("select time out\n");
                break;
            }
            else if( nsel < 0 )
            {
                if( errno == EINTR )
                    continue;
                printf("select error:%s\n",strerror(errno) );
                break;
            }

            for( int p = 0; p < MAXSOCKET; p++ )
            {
                const int fd = clientfd[p];
                if( fd <= 0 )
                    continue;

                /* Drop sockets with a pending error (e.g. a failed connect). */
                int errorflag = 0;
                socklen_t n = sizeof(errorflag);
                if( getsockopt(fd,SOL_SOCKET,SO_ERROR,&errorflag,&n) < 0 ||
                    errorflag != 0 )
                {
                    printf("socket error\n");
                    if( FD_ISSET(fd,&rset) )
                    {
                        FD_CLR(fd,&rset);
                        frcount --;
                    }
                    if( FD_ISSET(fd,&wset) )
                    {
                        FD_CLR(fd,&wset);
                        fwcount --;
                    }
                    close(fd);
                    clientfd[p] = -1;
                    continue;
                }

                if( FD_ISSET(fd,&w_set) )
                {
                    /* Connect finished: send the request. */
                    fwcount --;
                    FD_CLR(fd,&wset);
                    if( sendWebRequest(fd,request) != 0 )
                    {
                        close(fd);
                        clientfd[p] = -1;
                        /* The fd is gone; it must not be tested against r_set. */
                        continue;
                    }
                    frcount ++;
                    FD_SET(fd,&rset);
                }

                if( FD_ISSET(fd,&r_set) )
                {
                    /* Response data available: drain it and finish this socket. */
                    if( recvBack(fd) == 0 )
                    {
                        goodCount ++;
                    }
                    FD_CLR(fd,&rset);
                    frcount --;
                    close(fd);
                    clientfd[p] = -1;
                }
            }
        }

        /* Close whatever is still open (select timeout or error ended the round early). */
        for( int p = 0; p < MAXSOCKET; p++ )
        {
            if( clientfd[p] > 0 )
            {
                close(clientfd[p]);
                clientfd[p] = -1;
            }
        }
    }

    start_secs = time(NULL) - start_secs;
    std::cout << "leftTime=" << start_secs << " s dealCount="
              << MAXSOCKET * _MAXLOOP_ << ",sucucess=" << goodCount << std::endl;
    return 0;
}
回复 1楼 sevendays 的帖子
哎,没人回复!只好自己测试了,按照http协议,发送请求的包为keep-alive 终于快多了,但是速度还是不理想.按照keep-alive的connection,每秒能处理30个左右,但是还能提高,把发送和接收改成异步的吧 |
顶一下,只有学习的份了。。请问keep-alive 是啥含义,一直没弄清楚。。。 |
回复 6楼 chenyajun5 的帖子
keep-alive跟socket没关系,是http协议的一个连接状态控制,我之所以抓得慢,应该是犯了个错误,
http请求包中我一直用:
GET /jlp/2006-02-06/104814158.html HTTP/1.1\r\nHOST: bn.sina.com.cn\r\nAccept: */*\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\nPragma: no-cache\r\nCache-Control: no-cache\r\nConnection: close\r\n\r\n
这样的话每次都得重新连接
改成keep-alive就可在一个连接上多次发送请求包
只怪我自己做什么都太喜欢拿现成的,没仔细看........。
可以长连接的话发送和接收就可以异步了,修改程序结构 |
| |