之前讨论过网页爬虫, 本文也是沿用之前思路。 以我的CSDN博客为例, 来获取访问量和积分信息, 代码如下:
#include <stdio.h>
#include <winsock2.h>
#include <time.h>
#include <iostream>
#include <string>
#pragma comment(lib, "ws2_32.lib")
using namespace std;
// Convert a Unix timestamp to a local-time string "YYYY-MM-DD HH:MM:SS".
//   n       : seconds since the Unix epoch
//   strTime : output buffer; always NUL-terminated on return
//   bufLen  : total size of strTime in bytes
void unixTime2Str(int n, char strTime[], int bufLen)
{
    if (strTime == NULL || bufLen <= 0)
        return;
    strTime[0] = '\0';
    // Widen into a real time_t first: the old *(time_t *)&n cast reads the
    // wrong number of bytes when time_t is 64-bit and breaks strict aliasing.
    time_t t = (time_t)n;
    struct tm *pTm = localtime(&t);   // may return NULL for unrepresentable times
    if (pTm != NULL)
    {
        // strftime's size argument already accounts for the terminating NUL.
        strftime(strTime, bufLen, "%Y-%m-%d %H:%M:%S", pTm);
    }
    strTime[bufLen - 1] = '\0';
}
int main()
{
HOSTENT *pHost = NULL;
WSADATA wsaData;
WSAStartup(MAKEWORD(1,1), &wsaData);
char szWeb[] = "blog.csdn.net"; // csdn
if(NULL == pHost)
{
pHost = gethostbyname(szWeb);
}
const char* pIPAddr = inet_ntoa(*((struct in_addr *)pHost->h_addr)) ;
printf("web server ip is : %s", pIPAddr);
SOCKADDR_IN webServerAddr;
webServerAddr.sin_family = AF_INET;
webServerAddr.sin_addr.S_un.S_addr=inet_addr(pIPAddr);
webServerAddr.sin_port = htons(80);
SOCKET sockClient = socket(AF_INET, SOCK_STREAM, 0);
int nRet = connect(sockClient ,(struct sockaddr*)&webServerAddr, sizeof(webServerAddr));
if(nRet < 0)
{
printf("connect error\n");
return nRet;
}
char szHttpRest[10240] = {0};
sprintf(szHttpRest, "GET /stpeace HTTP/1.1\r\nHost:%s\r\nConnection: Keep-Alive\r\n\r\n", szWeb);
printf("\n-----send is :----\n");
printf("%s\n", szHttpRest);
nRet = send(sockClient , szHttpRest, strlen(szHttpRest) + 1, 0);
if(nRet < 0)
{
printf("send error\n");
return nRet;
}
// 当前时间
char strTime[100] = {0};
int now = time(NULL);
unixTime2Str(now, strTime, sizeof(strTime));
string sRecv;
while(1)
{
char szRecvBuf[2] = {0};
nRet = recv(sockClient ,szRecvBuf, 1 ,0);
if(nRet < 0)
{
printf("recv error\n");
goto LABEL;
}
if(0 == nRet)
{
goto LABEL;
}
sRecv += string(1, szRecvBuf[0]);
}
LABEL:
closesocket(sockClient);
WSACleanup();
unsigned int pos = 0;
pos = sRecv.find("blog_rank");
if(pos != string::npos)
{
printf("%s\n", strTime);
// 如下位置起点和偏移量与博文中访问量和积分这两个数字的位数有关
printf("%s\n", sRecv.substr(pos + 36, 7).c_str());
printf("%s\n", sRecv.substr(pos + 83, 5).c_str());
}
printf("well done\n");
while(1);
return 0;
}
运行结果:
代码很简单, 就不多解释了。