项目地址请看: https://github.com/YongHaoWu/http_client
首先,我们想想浏览器做了什么事情, 无非就是根据输入 url,发送请求到对方服务器, 获取相应的文件,展示内容给用户。
先回顾下我一篇文章里讲解的 URI Scheme
。
Uniform Resource Identifier (URI,统一资源标志符) ,是用来标识资源的字符串,规定了以下文法:
上图可见,有效的 URI
至少包含 scheme
,:
以及 path(路径)
,如经典的触发操作系统发邮件动作的mailto 格式:
mailto:[email protected]
└─┬──┘ └────┬─────────────┘
scheme path
其他就是可选的组合路径了,如scheme://host path ? query # fragment
就是我们常见的 url 格式:
userinfo host port
┌──┴───┐ ┌──────┴──────┐ ┌┴┐
https://[email protected]:123/forum/questions/?tag=networking&order=newest#top
└─┬─┘ └───────────┬──────────────┘└───────┬───────┘ └───────────┬─────────────┘ └┬┘
scheme authority path query fragment
hostname
。比如这个 url https://stackoverflow.com/questions/tagged/elixir
,hostname
就是 stackoverflow.com
, 其他的不过是 URL
的一部分。 程序中的extract_hostname
函数便是做这个事情,下面的代码处理了很多边角情况:
/*
 * Report whether `url` begins with an "http://" or "https://" scheme prefix.
 *
 * url     - NUL-terminated string to inspect.
 * url_len - length of `url`; anything shorter than 7 (the length of
 *           "http://") is rejected without looking at the text.
 *
 * Returns 1 when one of the two prefixes matches, 0 otherwise.
 */
int is_http_or_https_scheme(char *url, int url_len) {
    if (url_len < 7) {
        return 0;
    }
    if (strncmp(url, "http://", 7) == 0) {
        return 1;
    }
    return strncmp(url, "https://", 8) == 0 ? 1 : 0;
}
/*
 * Extract the host name from a URL-like string.
 *
 * Accepted shapes (with or without a path after the host):
 *   http://abc.com/abc      https://www.abc.com/abc
 *   abc.com/abc             www.abc.com/abc
 *
 * An "http://" / "https://" prefix is skipped, then a leading "www." is
 * dropped, and the host ends at the first '/', ':', '?' or '#' (or at the end
 * of the string when none of them is present).
 *
 * url - NUL-terminated input string; it is not modified.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(), or NULL when `url` is NULL or the allocation fails.
 */
char* extract_hostname(char *url) {
    if (url == NULL) {
        return NULL;
    }

    const int has_scheme = is_http_or_https_scheme(url, (int)strlen(url));
    const char *host = url;
    if (has_scheme) {
        /* The scheme check guarantees the "://" separator is present. */
        const char *sep = strstr(url, "://");
        if (sep != NULL) {
            host = sep + 3;
        }
    }

    /* Test for "www." at the start of the host, not at the start of the URL. */
    if (strncmp(host, "www.", 4) == 0) {
        host += 4;
    }

    /* strcspn stops at the terminating NUL, so a URL without a path is safe. */
    const size_t len = strcspn(host, "/:?#");

    /* +1 for the terminator: the result is printed and used as a C string. */
    char *ret = malloc(len + 1);
    if (ret == NULL) {
        return NULL;
    }
    memcpy(ret, host, len);
    ret[len] = '\0';

    if (has_scheme) {
        printf("schema: http://abc.com/abc \n ret is %s\n", ret);
    } else {
        printf("schema: abc.com/abc \n ret is %s\n", ret);
    }
    return ret;
}
函数 getIPFromDNS
负责把 hostname 解析成 ip 地址, 主要调用 linux 的 gethostbyname 即可完成 dns 解析
, 得到一个 ip 数组, 通常选一个即可。
The gethostbyname() function returns a structure of type hostent for the given host name.
The hostent structure is defined in <netdb.h> as follows:
struct hostent {
char *h_name; /* official name of host */
char **h_aliases; /* alias list */
int h_addrtype; /* host address type */
int h_length; /* length of address */
char **h_addr_list; /* list of addresses */
}
#define h_addr h_addr_list[0] /* for backward compatibility */
所以h_name
就是正式 host 名,h_addr_list
就是解析到的此 host 的 ip 地址数组。
/*
 * Resolve `host` with gethostbyname() and return one of its addresses as a
 * dotted-decimal string.
 *
 * Every resolved address is printed; the buffer ends up holding the last
 * entry of the address list.
 *
 * host - NUL-terminated host name to look up.
 *
 * Returns a heap buffer the caller must free():
 *   - the address text on success;
 *   - an empty string ("") when the lookup fails, so a caller that passes the
 *     result on without checking still receives a valid C string;
 *   - NULL only when the buffer itself cannot be allocated.
 *
 * NOTE(review): the cast to struct in_addr assumes the resolver returned
 * AF_INET entries (h_addrtype is not checked) - confirm if IPv6 matters.
 */
char* getIPFromDNS(char *host) {
    enum { ADDRESS_BUF_LEN = 50 };

    /* calloc zero-fills, which gives the empty-string result on failure. */
    char *address = calloc(ADDRESS_BUF_LEN, 1);
    if (address == NULL) {
        fprintf(stderr, "ERROR, out of memory\n");
        return NULL;
    }

    struct hostent *server = gethostbyname(host);
    if (server == NULL) {
        fprintf(stderr,"ERROR, no such host\n");
        return address;
    }

    printf("server 's hostname: %s\n\n", server->h_name);
    struct in_addr **addr_list = (struct in_addr **) server->h_addr_list;
    for (int i = 0; addr_list[i] != NULL; i++) {
        /* inet_ntoa() returns a static buffer, so copy it out right away. */
        snprintf(address, ADDRESS_BUF_LEN, "%s", inet_ntoa(*addr_list[i]));
        printf(" 解析到的 ip 地址为: IP ADDRESS->%s\n", address);
    }
    return address;
}
函数init_serv_addr
配置 socket
的信息, 如使用 ipv4
, 用 80 端口, 访问哪个 ip
,端口是多少等:
/*
 * Build an IPv4 socket address for the given endpoint.
 *
 * address - dotted-decimal IPv4 text, converted with inet_addr().
 * port    - port number in host byte order; stored in network byte order.
 *
 * Returns the filled-in structure by value; all other bytes are zero.
 */
struct sockaddr_in init_serv_addr(char *address, int port) {
    struct sockaddr_in endpoint;

    /* Clear the whole structure first so unused fields are zero. */
    memset(&endpoint, 0, sizeof endpoint);

    endpoint.sin_port = htons(port);
    endpoint.sin_addr.s_addr = inet_addr(address);
    endpoint.sin_family = AF_INET;

    return endpoint;
}
/* Describe the remote endpoint: the resolved address on port 80. */
struct sockaddr_in serv_addr = init_serv_addr(address, 80);
/* Create a TCP (stream) socket; socket() returns a negative value on failure. */
int sock = socket(AF_INET, SOCK_STREAM, 0);
if (sock < 0) {
    /* Report socket creation separately instead of as a connect error. */
    printf("socket err");
    return 0;
}
if (connect(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
    printf("connect err");
    return 0;
}
连接 socket
后, generate_request
生成 http request
, 注意第一行即描述了使用 GET
方法, HTTP 1.1
版本, HOST
头部是必须的, 因为大多数 web 服务器都设置了虚拟主机, 也就是根据 HOST
来把请求转发到不同的站点,所以你 http header
里不写 host,仅仅使用 ip 是不行的,比如百度等。
/*
 * Format a minimal HTTP/1.1 GET request for the root path "/" into
 * `request_body`, then echo the request to stdout.
 *
 * hostname     - value written into the HOST header.
 * url          - not read by this function; the request line always asks
 *                for "/".
 * request_body - caller-provided output buffer. It is filled with sprintf(),
 *                so it must be large enough for the fixed headers plus
 *                `hostname` and the terminating NUL.
 */
void generate_request(char *hostname, char *url, char *request_body)
{
    /*
     * Pitfall: a header line must not begin with whitespace. Building the
     * template from adjacent string literals keeps source indentation out of
     * the request text.
     * The HOST header is mandatory so the server can pick the right
     * server_name (virtual host).
     * (Appending the pieces with repeated strcat() calls would also work, but
     * takes many more function calls.)
     */
    static const char request_template[] =
        "GET / HTTP/1.1\r\n"
        "HOST: %s\r\n"
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36\r\n"
        "Cache-Control: no-cache\r\n\r\n";

    (void)url;

    sprintf(request_body, request_template, hostname);
    printf("-> HTTP 请求报文如下\n--------HTTP Request--------\n%s--------\n", request_body);
}
total
大小的 body
在发完之前一直写 socket
,没有写过 linux 网络程序的人可以留意一下,写了多少是由 socket 自身决定的,程序控制不了,所以要 write 后才得知写了多少,然后调整下次发送的内容。
/*
 * Send the request. write() may accept fewer bytes than asked for, so keep
 * writing from the first unsent byte until the whole request has gone out.
 */
int total = strlen(request_body);
int sent = 0;
do {
    printf("total is %d \n", total);
    int bytes = write(sock, request_body + sent, total - sent);
    if (bytes < 0) {
        /*
         * Stop on error: adding a negative count to `sent` would move the
         * offset backwards and resend data forever.
         */
        printf("ERROR writing message to socket");
        break;
    }
    if (bytes == 0)
        break;
    printf("wrote bytes %d \n", bytes);
    sent += bytes;
} while (sent < total);
收到 response
的头部后(根据\r\n\r\n
划分), 解析出 Content-Length
, 接着收剩下的内容:
/*
 * Receive the response into `resp`.
 *
 * After every recv() the bytes received so far are rescanned from the start:
 * the header block ends at the first "\r\n\r\n", and a "Content-Length:"
 * line inside it gives the body length. Reading stops when
 * header bytes + Content-Length bytes have arrived, when the peer closes the
 * connection, when recv() fails, or when the buffer is full.
 *
 * The header name is matched case-sensitively ("Content-Length:").
 */
char resp[10*1024];
memset(resp, 0, sizeof(resp));
/* Leave the last byte untouched so resp always stays NUL-terminated. */
total = sizeof(resp)-1;
int received = 0;
int content_len = 0;
int body_size = 0;
do {
    content_len = 0;
    printf("---------------\n");
    printf("start received: %d, total: %d, total - received= %d \n", received, total,
            total-received);
    int bytes = recv(sock, resp+received, total-received, 0);
    printf("received bytes %d \n", bytes);
    if (bytes < 0) {
        /* Stop on error; a negative count must not be added to `received`. */
        printf("ERROR reading resp from socket");
        break;
    }
    if (bytes == 0) {
        printf("received bytes 0, break\n");
        break;
    }
    received += bytes;
    printf("-------- sizeof(resp) %zu\n", sizeof(resp));
    int header_size = 0;
    /* Scan only the bytes that have actually arrived. */
    for(; header_size < received &&
            (strncmp(resp+header_size, "\r\n\r\n", strlen("\r\n\r\n")) != 0); ++header_size) {
        if(strncmp(resp+header_size, "Content-Length:", strlen("Content-Length:")) == 0) {
            int tmp = header_size + strlen("Content-Length:");
            /* Any amount of blank space (including none) may follow the colon. */
            while(tmp < received && (resp[tmp] == ' ' || resp[tmp] == '\t')) {
                tmp++;
            }
            /*
             * Read digits only, and never past the received data: the end of
             * the line may not have arrived yet. The value is parsed again
             * from scratch on the next pass.
             */
            content_len = 0;
            while(tmp < received && resp[tmp] >= '0' && resp[tmp] <= '9') {
                content_len = content_len*10 + (resp[tmp] - '0');
                tmp++;
            }
        }
        printf("%c", resp[header_size]);
    }
    printf("\ncontent_len %d\n", content_len);
    /* Count the blank line that separates the headers from the body. */
    header_size += strlen("\r\n\r\n");
    /* Despite its name, body_size is the expected size of the whole response. */
    body_size = content_len + header_size;
    printf("header size %d\n", header_size);
    printf("\nbody_size %d\n", body_size);
    printf("end received: %d, total: %d, total - received= %d \n", received, total,
            total-received);
} while(received != body_size && received < total);
至此, 一个简单的 http client 完成.
http://example.com 解析到的 ip 地址为: IP ADDRESS->93.184.216.34
HTTP 请求报文如下
--------HTTP Request--------
GET / HTTP/1.1
HOST: example.com
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36
Cache-Control: no-cache
服务器 response 回复的头部为:
HTTP/1.1 200 OK
Cache-Control: max-age=604800
Content-Type: text/html
Date: Thu, 21 Jun 2018 10:35:29 GMT
Etag: "1541025663+ident"
Expires: Thu, 28 Jun 2018 10:35:29 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (oxr/8313)
Vary: Accept-Encoding
X-Cache: HIT
Content-Length: 1270
接着的内容是:
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 50px;
background-color: #fff;
border-radius: 1em;
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
body {
background-color: #fff;
}
div {
width: auto;
margin: 0 auto;
border-radius: 0;
padding: 1em;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
以上是网页内容了。
接下来可以做的有:
本文只是抛砖引玉; P
在下在 Shopee 工作,觉得水深火热不喜欢加班的同学可以考虑一下
拒绝 996,那就来 shopee,待遇 work life balance 两不误: https://www.v2ex.com/t/672561#reply1
1
sadfQED2 2020-05-25 12:51:58 +08:00 via Android
???还有这样招人的
|
2
Lax 2020-05-25 13:03:07 +08:00
不加班造钉子???
为啥要单独处理一下 `www.` |
3
ChristopherWu OP @sadfQED2 🤭 内推也要发有意义的东西
|
4
ChristopherWu OP @Lax 两年前的个人项目,学习 http 协议写的。单独处理 www 是想把 hostname 拿出来
|
5
Alex5467 2020-05-25 15:07:58 +08:00
哈哈哈,他之前还发过一个,我看到最后也是发现在招人
|
6
Alex5467 2020-05-25 15:08:23 +08:00
|
7
ChristopherWu OP @Alex5467 新套路😈
|