程序代码1
#include <iostream>
#include <iomanip>
#include <windows.h>
#include <wininet.h>
#include <regex>
#include <sstream>
#include <fstream>
#pragma comment(lib,"WinInet.lib")
using namespace std;
string HttpRequest(string strUrl, string strMethod = "GET", string strPostData = "")
{
BOOL bRet;
short sPort = 80;
char* lpHostName, * lpUrl, * lpMethod, * lpPostData;
int pos, nPostDataLen = strPostData.size();
string tmpUrl, strResponse = "";
while ((pos = strUrl.find(" ")) != strUrl.npos)
strUrl.erase(pos, 1);
if (strUrl.substr(0, 7) == "http://")
strUrl = strUrl.substr(7, strUrl.size() - 1);
if (strUrl.substr(0, 8) == "https://")
strUrl = strUrl.substr(8, strUrl.size() - 1);
if (strUrl.empty())
return strResponse;
if ((pos = strUrl.find("/")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
}
else tmpUrl = "/";
lpUrl = (char*)tmpUrl.data();
lpMethod = (char*)strMethod.data();
lpHostName = (char*)strUrl.data();
lpPostData = (char*)strPostData.data();
if ((pos = strUrl.find(":")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
sPort = (short)atoi(tmpUrl.c_str());
}
HINTERNET hInternet, hConnect, hRequest;
hInternet = hConnect = hRequest = NULL;
hInternet = (HINSTANCE)InternetOpen("User-Agent", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (!hInternet)
{
return strResponse;
}
hConnect = (HINSTANCE)InternetConnect(hInternet, lpHostName, sPort, NULL, "HTTP/1.1", INTERNET_SERVICE_HTTP, 0, 0);
if (!hConnect)
{
if (hInternet) InternetCloseHandle(hInternet);
return strResponse;
}
hRequest = (HINSTANCE)HttpOpenRequest(hConnect, lpMethod, lpUrl, "HTTP/1.1", NULL, NULL, INTERNET_FLAG_RELOAD, 0);
if (!hRequest)
{
if (hInternet) InternetCloseHandle(hInternet);
if (hConnect) InternetCloseHandle(hConnect);
return strResponse;
}
bRet = HttpSendRequest(hRequest, NULL, 0, lpPostData, nPostDataLen);
while (true)
{
char cReadBuffer[4096];
unsigned long lNumberOfBytesRead;
bRet = InternetReadFile(hRequest, cReadBuffer, sizeof(cReadBuffer) - 1, &lNumberOfBytesRead);
if (!bRet || !lNumberOfBytesRead) break;
cReadBuffer[lNumberOfBytesRead] = 0;
strResponse = strResponse + cReadBuffer;
}
if (hRequest) InternetCloseHandle(hRequest);
if (hConnect) InternetCloseHandle(hConnect);
if (hInternet) InternetCloseHandle(hInternet);
return strResponse;
}
bool DownloadFile(string strUrl, string strFullFileName, unsigned int BUF_SIZE_KB = 1)
{
size_t pos = 0;
short sPort = 80;
BOOL bRet;
char* lpHostName, * lpUrl;
string tmpUrl, strResponse = "";
while ((pos = strUrl.find(" ")) != strUrl.npos)
strUrl.erase(pos, 1);
if (strUrl.substr(0, 7) == "http://")
strUrl = strUrl.substr(7, strUrl.size() - 1);
if (strUrl.substr(0, 8) == "https://")
strUrl = strUrl.substr(8, strUrl.size() - 1);
if (strUrl.empty()) return false;
if ((pos = strUrl.find("/")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
}
else tmpUrl = "/";
lpUrl = (char*)tmpUrl.data();
lpHostName = (char*)strUrl.data();
if ((pos = strUrl.find(":")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
sPort = (short)atoi(tmpUrl.c_str());
}
HINTERNET hInternet, hConnect, hRequest;
hInternet = hConnect = hRequest = NULL;
hInternet = (HINSTANCE)InternetOpen("User-Agent", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (!hInternet) return false;
hConnect = (HINSTANCE)InternetConnect(hInternet, lpHostName, sPort, NULL, "HTTP/1.1", INTERNET_SERVICE_HTTP, 0, 0);
if (!hConnect)
{
if (hInternet) InternetCloseHandle(hInternet);
return false;
}
char lpMethod[] = "GET";
hRequest = (HINSTANCE)HttpOpenRequest(hConnect,lpMethod, lpUrl, "HTTP/1.1", NULL, NULL, INTERNET_FLAG_RELOAD, 0);
if (!hRequest)
{
if (hInternet) InternetCloseHandle(hInternet);
if (hConnect) InternetCloseHandle(hConnect);
return false;
}
bRet = HttpSendRequest(hRequest, NULL, 0, NULL, 0);
BUF_SIZE_KB = (BUF_SIZE_KB == 0) ? 1024 : BUF_SIZE_KB * 1024;
char* buf = new char[BUF_SIZE_KB];
DWORD buf_len, buf_read;
buf_len = buf_read = BUF_SIZE_KB;
FILE* fp = fopen(strFullFileName.c_str(), "wb");
while (true)
{
InternetReadFile(hRequest, buf, buf_len, &buf_read);
if (buf_read == 0) break;
fwrite(buf, 1, buf_read, fp);
}
delete[] buf;
fclose(fp);
if (hRequest) InternetCloseHandle(hRequest);
if (hConnect) InternetCloseHandle(hConnect);
if (hInternet) InternetCloseHandle(hInternet);
return true;
}
/// Runs the regular expression str_reg over str and appends the resulting
/// tokens to vect (existing elements are kept).
///
/// @param str     Text to scan.
/// @param str_reg Regular expression (default ECMAScript grammar).
/// @param vect    Receives the tokens, appended at the end.
/// @param pos     -1 collects the text between matches (split mode); any other
///                value collects the full matches.
/// @return The size of vect after appending.
int regexSplit(string& str, const string str_reg, vector<string>& vect, int pos = 0)
{
    // Only two selectors are supported; everything except -1 means "whole match".
    const int selector = (pos == -1) ? -1 : 0;
    const regex pattern(str_reg);
    const sregex_token_iterator finished;

    sregex_token_iterator token(str.begin(), str.end(), pattern, selector);
    while (token != finished)
    {
        vect.push_back(token->str());
        ++token;
    }
    return static_cast<int>(vect.size());
}
/// Replaces every non-overlapping occurrence of sub1 in s with sub2, in place.
///
/// @param s    String to modify.
/// @param sub1 Text to search for.
/// @param sub2 Replacement text.
/// @return A copy of s after the replacement. If s, sub1 or sub2 is empty the
///         string is left untouched.
string replaceAll(string& s, const string sub1, const string sub2)
{
    if (s.empty() || sub1.empty() || sub2.empty())
        return s;

    size_t pos = 0;
    while ((pos = s.find(sub1, pos)) != string::npos)
    {
        s.replace(pos, sub1.size(), sub2);
        // Continue after the inserted text: rescanning it would never
        // terminate when sub2 itself contains sub1.
        pos += sub2.size();
    }
    return s;
}
/// Formats i as decimal text, zero-padded to at least five characters.
///
/// @param i Value to format.
/// @return e.g. "00007" for 7, "123456" for 123456, "-0005" for -5.
string int2str(int i)
{
    ostringstream out;
    // std::internal puts the fill between the sign and the digits, so a
    // negative value is not rendered as "000-5".
    out << std::internal << setfill('0') << setw(5) << i;
    return out.str();
}
/// Crawls a zol.com.cn slide gallery: downloads every 1280x720 image found on
/// the current page, then follows the page's "tutie" link to the next gallery
/// until no such link is found or the attempt limit is reached.
int main()
{
    // Upper bound on download attempts, checked after every attempt so the
    // crawl stays bounded even when downloads fail.
    const int kMaxImages = 10000;
    const string kImageHost = "http://article-fd.zol-img.com.cn/";
    const string kSavePrefix = "C:\\Users\\18049\\Desktop\\pic\\pic";
    const string kSiteRoot = "https://esports.zol.com.cn";
    // A link match has 9 leading characters before the path; 23 is the total
    // number of characters trimmed from the match (9 in front, 14 behind).
    const size_t kLinkLead = 9;
    const size_t kLinkTrim = 23;

    int i = 0;
    vector<string> vect;
    string url = "https://esports.zol.com.cn/slide/754/7542170_1.html#p=2";
    string Html = HttpRequest(url);
    cout << "1-" << endl;

    while (Html.find("tutie") != string::npos)
    {
        vect.clear();
        regexSplit(Html, "t_s1280x720(.*?).jpg", vect);
        for (auto& v : vect)
        {
            // The page embeds the paths with escaped slashes ("\/").
            replaceAll(v, "\\/", "/");
            const bool bRet = DownloadFile(kImageHost + v, kSavePrefix + int2str(++i) + ".jpg");
            if (bRet)
            {
                cout << ".";
                if (i % 20 == 0) cout << endl;
                if (i % 100 == 0)
                {
                    system("cls");
                    cout << i + 1 << "-" << endl;
                }
            }
            if (i >= kMaxImages) return 0;
        }

        vect.clear();
        regexSplit(Html, "<a href=\"/slide/(.*?)tutie", vect);
        // No next-gallery link, or a match too short to trim: stop instead of
        // throwing from at()/substr() or building a garbage URL.
        if (vect.empty() || vect.front().size() <= kLinkTrim)
            break;

        const string& link = vect.front();
        const string nextUrl = kSiteRoot + link.substr(kLinkLead, link.size() - kLinkTrim);
        // A page that links to itself would otherwise be fetched forever.
        if (nextUrl == url)
            break;

        url = nextUrl;
        Html = HttpRequest(url);
    }
    return 0;
}
程序代码2 - 封装成类
初次运行爬取速度会很慢,第二次就好很多了
#include <iostream>
#include <iomanip>
#include <windows.h>
#include <wininet.h>
#include <regex>
#include <sstream>
#include <fstream>
#pragma comment(lib,"WinInet.lib")
using namespace std;
// Bundles the gallery crawler and its helper routines into one class.
// Constructing an instance runs the whole crawl (see the constructor).
class PictureDownloader
{
public:
// Fetches the start page, downloads the images it references and keeps
// following the "tutie" next-gallery link while one is found.
PictureDownloader();
// Sends a blocking WinInet request and returns the response body; an empty
// string is returned when the URL is empty or the request cannot be set up.
string HttpRequest(string strUrl, string strMethod = "GET", string strPostData = "");
// Downloads strUrl with a GET request into the file strFullFileName.
// BUF_SIZE_KB is the read-buffer size in KiB (0 is treated as 1). Returns
// false when the URL is empty or the WinInet handles cannot be created.
bool DownloadFile(string strUrl, string strFullFileName, unsigned int BUF_SIZE_KB = 1);
// Appends regex tokens of str to vect and returns vect's resulting size.
// pos == -1 yields the text between matches; any other value yields the
// full matches.
int regexSplit(string& str, const string str_reg, vector<string>& vect, int pos = 0);
// Replaces occurrences of sub1 in s with sub2 in place and returns a copy of
// s; nothing is changed if any of the three strings is empty.
string replaceAll(string& s, const string sub1, const string sub2);
// Formats i as text, zero-filled to a minimum width of five characters.
string int2str(int i);
};
/// Runs the whole crawl: downloads every 1280x720 image found on the current
/// zol.com.cn slide page, then follows the page's "tutie" link to the next
/// gallery until no such link is found or the attempt limit is reached.
PictureDownloader::PictureDownloader()
{
    // Upper bound on download attempts, checked after every attempt so the
    // crawl stays bounded even when downloads fail.
    const int kMaxImages = 10000;
    const string kImageHost = "http://article-fd.zol-img.com.cn/";
    const string kSavePrefix = "C:\\Users\\18049\\Desktop\\pic\\pic";
    const string kSiteRoot = "https://esports.zol.com.cn";
    // A link match has 9 leading characters before the path; 23 is the total
    // number of characters trimmed from the match (9 in front, 14 behind).
    const size_t kLinkLead = 9;
    const size_t kLinkTrim = 23;

    int i = 0;
    vector<string> vect;
    string url = "https://esports.zol.com.cn/slide/754/7542170_1.html#p=2";
    string Html = HttpRequest(url);
    cout << "1-" << endl;

    while (Html.find("tutie") != string::npos)
    {
        vect.clear();
        regexSplit(Html, "t_s1280x720(.*?).jpg", vect);
        for (auto& v : vect)
        {
            // The page embeds the paths with escaped slashes ("\/").
            replaceAll(v, "\\/", "/");
            const bool bRet = DownloadFile(kImageHost + v, kSavePrefix + int2str(++i) + ".jpg");
            if (bRet)
            {
                cout << ".";
                if (i % 20 == 0) cout << endl;
                if (i % 100 == 0)
                {
                    system("cls");
                    cout << i + 1 << "-" << endl;
                }
            }
            // Leave the constructor normally instead of calling exit(), which
            // would terminate the process from inside a constructor.
            if (i >= kMaxImages) return;
        }

        vect.clear();
        regexSplit(Html, "<a href=\"/slide/(.*?)tutie", vect);
        // No next-gallery link, or a match too short to trim: stop instead of
        // throwing from at()/substr() or building a garbage URL.
        if (vect.empty() || vect.front().size() <= kLinkTrim)
            break;

        const string& link = vect.front();
        const string nextUrl = kSiteRoot + link.substr(kLinkLead, link.size() - kLinkTrim);
        // A page that links to itself would otherwise be fetched forever.
        if (nextUrl == url)
            break;

        url = nextUrl;
        Html = HttpRequest(url);
    }
}
string PictureDownloader::HttpRequest(string strUrl, string strMethod, string strPostData)
{
BOOL bRet;
short sPort = 80;
char* lpHostName, * lpUrl, * lpMethod, * lpPostData;
int pos, nPostDataLen = strPostData.size();
string tmpUrl, strResponse = "";
while ((pos = strUrl.find(" ")) != strUrl.npos)
strUrl.erase(pos, 1);
if (strUrl.substr(0, 7) == "http://")
strUrl = strUrl.substr(7, strUrl.size() - 1);
if (strUrl.substr(0, 8) == "https://")
strUrl = strUrl.substr(8, strUrl.size() - 1);
if (strUrl.empty())
return strResponse;
if ((pos = strUrl.find("/")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
}
else tmpUrl = "/";
lpUrl = (char*)tmpUrl.data();
lpMethod = (char*)strMethod.data();
lpHostName = (char*)strUrl.data();
lpPostData = (char*)strPostData.data();
if ((pos = strUrl.find(":")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
sPort = (short)atoi(tmpUrl.c_str());
}
HINTERNET hInternet, hConnect, hRequest;
hInternet = hConnect = hRequest = NULL;
hInternet = (HINSTANCE)InternetOpen("User-Agent", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (!hInternet)
{
return strResponse;
}
hConnect = (HINSTANCE)InternetConnect(hInternet, lpHostName, sPort, NULL, "HTTP/1.1", INTERNET_SERVICE_HTTP, 0, 0);
if (!hConnect)
{
if (hInternet) InternetCloseHandle(hInternet);
return strResponse;
}
hRequest = (HINSTANCE)HttpOpenRequest(hConnect, lpMethod, lpUrl, "HTTP/1.1", NULL, NULL, INTERNET_FLAG_RELOAD, 0);
if (!hRequest)
{
if (hInternet) InternetCloseHandle(hInternet);
if (hConnect) InternetCloseHandle(hConnect);
return strResponse;
}
bRet = HttpSendRequest(hRequest, NULL, 0, lpPostData, nPostDataLen);
while (true)
{
char cReadBuffer[4096];
unsigned long lNumberOfBytesRead;
bRet = InternetReadFile(hRequest, cReadBuffer, sizeof(cReadBuffer) - 1, &lNumberOfBytesRead);
if (!bRet || !lNumberOfBytesRead) break;
cReadBuffer[lNumberOfBytesRead] = 0;
strResponse = strResponse + cReadBuffer;
}
if (hRequest) InternetCloseHandle(hRequest);
if (hConnect) InternetCloseHandle(hConnect);
if (hInternet) InternetCloseHandle(hInternet);
return strResponse;
}
bool PictureDownloader::DownloadFile(string strUrl, string strFullFileName, unsigned int BUF_SIZE_KB)
{
size_t pos = 0;
short sPort = 80;
BOOL bRet;
char* lpHostName, * lpUrl;
string tmpUrl, strResponse = "";
while ((pos = strUrl.find(" ")) != strUrl.npos)
strUrl.erase(pos, 1);
if (strUrl.substr(0, 7) == "http://")
strUrl = strUrl.substr(7, strUrl.size() - 1);
if (strUrl.substr(0, 8) == "https://")
strUrl = strUrl.substr(8, strUrl.size() - 1);
if (strUrl.empty()) return false;
if ((pos = strUrl.find("/")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
}
else tmpUrl = "/";
lpUrl = (char*)tmpUrl.data();
lpHostName = (char*)strUrl.data();
if ((pos = strUrl.find(":")) != strUrl.npos)
{
tmpUrl = strUrl.substr(pos + 1, strUrl.size() - 1);
strUrl = strUrl.substr(0, pos);
sPort = (short)atoi(tmpUrl.c_str());
}
HINTERNET hInternet, hConnect, hRequest;
hInternet = hConnect = hRequest = NULL;
hInternet = (HINSTANCE)InternetOpen("User-Agent", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (!hInternet) return false;
hConnect = (HINSTANCE)InternetConnect(hInternet, lpHostName, sPort, NULL, "HTTP/1.1", INTERNET_SERVICE_HTTP, 0, 0);
if (!hConnect)
{
if (hInternet) InternetCloseHandle(hInternet);
return false;
}
char lpMethod[] = "GET";
hRequest = (HINSTANCE)HttpOpenRequest(hConnect,lpMethod, lpUrl, "HTTP/1.1", NULL, NULL, INTERNET_FLAG_RELOAD, 0);
if (!hRequest)
{
if (hInternet) InternetCloseHandle(hInternet);
if (hConnect) InternetCloseHandle(hConnect);
return false;
}
bRet = HttpSendRequest(hRequest, NULL, 0, NULL, 0);
BUF_SIZE_KB = (BUF_SIZE_KB == 0) ? 1024 : BUF_SIZE_KB * 1024;
char* buf = new char[BUF_SIZE_KB];
DWORD buf_len, buf_read;
buf_len = buf_read = BUF_SIZE_KB;
FILE* fp = fopen(strFullFileName.c_str(), "wb");
while (true)
{
InternetReadFile(hRequest, buf, buf_len, &buf_read);
if (buf_read == 0) break;
fwrite(buf, 1, buf_read, fp);
}
delete[] buf;
fclose(fp);
if (hRequest) InternetCloseHandle(hRequest);
if (hConnect) InternetCloseHandle(hConnect);
if (hInternet) InternetCloseHandle(hInternet);
return true;
}
/// Runs the regular expression str_reg over str and appends the resulting
/// tokens to vect (existing elements are kept).
///
/// @param str     Text to scan.
/// @param str_reg Regular expression (default ECMAScript grammar).
/// @param vect    Receives the tokens, appended at the end.
/// @param pos     -1 collects the text between matches (split mode); any other
///                value collects the full matches.
/// @return The size of vect after appending.
int PictureDownloader::regexSplit(string& str, const string str_reg, vector<string>& vect, int pos)
{
    // Only two selectors are supported; everything except -1 means "whole match".
    const int selector = (pos == -1) ? -1 : 0;
    const regex pattern(str_reg);
    const sregex_token_iterator finished;

    sregex_token_iterator token(str.begin(), str.end(), pattern, selector);
    while (token != finished)
    {
        vect.push_back(token->str());
        ++token;
    }
    return static_cast<int>(vect.size());
}
/// Replaces every non-overlapping occurrence of sub1 in s with sub2, in place.
///
/// @param s    String to modify.
/// @param sub1 Text to search for.
/// @param sub2 Replacement text.
/// @return A copy of s after the replacement. If s, sub1 or sub2 is empty the
///         string is left untouched.
string PictureDownloader::replaceAll(string& s, const string sub1, const string sub2)
{
    if (s.empty() || sub1.empty() || sub2.empty())
        return s;

    size_t pos = 0;
    while ((pos = s.find(sub1, pos)) != string::npos)
    {
        s.replace(pos, sub1.size(), sub2);
        // Continue after the inserted text: rescanning it would never
        // terminate when sub2 itself contains sub1.
        pos += sub2.size();
    }
    return s;
}
/// Formats i as decimal text, zero-padded to at least five characters.
///
/// @param i Value to format.
/// @return e.g. "00007" for 7, "123456" for 123456, "-0005" for -5.
string PictureDownloader::int2str(int i)
{
    ostringstream out;
    // std::internal puts the fill between the sign and the digits, so a
    // negative value is not rendered as "000-5".
    out << std::internal << setfill('0') << setw(5) << i;
    return out.str();
}
/// Program entry point. All work happens while the downloader object is being
/// constructed; nothing else is done with it afterwards.
int main()
{
    PictureDownloader downloader; // the constructor performs the crawl
    return 0;
}
更改配置
- 用 Visual Studio 2019 写代码前需要更改项目属性:
  - 添加 `_CRT_SECURE_NO_DEPRECATE` 到预处理器定义中
  - 把字符集由默认的 Unicode 字符集改为多字节字符集
- 用 Dev-C++ 写代码前需要更改编译选项(Compiler Option),在"编译时加入以下命令"和"在连接器命令行加入以下命令"的栏中加入命令 `-std=c++11 -lwininet`。(其实我没搞懂到底应该放在哪个栏里,两个都放进去之后,重启了 Dev-C++ 才能实现编译无错……按 GCC 的惯例,`-std=c++11` 属于编译选项,`-lwininet` 属于链接选项。)