V2EX = way to explore

V2EX 是一个关于分享和探索的地方

现在注册

已注册用户请登录

• 请不要在回答技术问题时复制粘贴 AI 生成的内容

这是一个创建于 1719 天前的主题，其中的信息可能已经有所发展或是发生改变。

项目地址请看： https://github.com/YongHaoWu/http_client

首先，我们想想浏览器做了什么事情，无非就是根据输入 url，发送请求到对方服务器，获取相应的文件，展示内容给用户。

先回顾下我一篇文章里讲解的 URI Scheme。

什么是 URI ？

Uniform Resource Identifier (URI，统一资源标志符) ，是用来标识资源的字符串，规定了以下文法：

上图可见，有效的 URI 至少包含 scheme，:以及 path(路径)，如经典的触发操作系统发邮件动作的mailto 格式：

  mailto:John.Doe@example.com
  └─┬──┘ └────┬─────────────┘
  scheme     path

其他就是可选的组合部分了，如 scheme://host path ? query # fragment 就是我们常见的 url 格式：

          userinfo       host      port
          ┌──┴───┐ ┌──────┴──────┐ ┌┴┐
  https://john.doe@www.example.com:123/forum/questions/?tag=networking&order=newest#top
  └─┬─┘   └───────────┬──────────────┘└───────┬───────┘ └───────────┬─────────────┘ └┬┘
  scheme          authority                  path                 query           fragment

根据输入的 url 地址，解析出主机地址 `hostname`。

比如这个 url https://stackoverflow.com/questions/tagged/elixir，hostname就是 stackoverflow.com，其他的不过是 URL 的一部分。程序中的extract_hostname 函数便是做这个事情，下面的代码处理了很多边角情况：

/*
 * Reports whether url begins with the literal prefix "http://" or "https://".
 *
 * url      NUL-terminated string to inspect.
 * url_len  length of url as computed by the caller; anything shorter than
 *          7 characters is rejected without looking at the text.
 *
 * Returns 1 when one of the two prefixes matches (case-sensitive), else 0.
 */
int is_http_or_https_scheme(char *url, int url_len) {
    /* Too short to hold even "http://". */
    if (url_len < 7) {
        return 0;
    }
    if (strncmp(url, "http://", 7) == 0) {
        return 1;
    }
    /* A 7-character url can never match here: the comparison stops at its NUL. */
    return strncmp(url, "https://", 8) == 0;
}

/* Returns the length of a leading "http://" or "https://" in url, or 0 if neither is present. */
static size_t http_scheme_prefix_len(const char *url) {
    static const char https_prefix[] = "https://";
    static const char http_prefix[] = "http://";

    if (strncmp(url, https_prefix, sizeof https_prefix - 1) == 0) {
        return sizeof https_prefix - 1;
    }
    if (strncmp(url, http_prefix, sizeof http_prefix - 1) == 0) {
        return sizeof http_prefix - 1;
    }
    return 0;
}

/*
 * Extracts the host name from a URL.
 *
 * Accepted shapes (an optional path, query or fragment may follow the host):
 *   http://abc.com/abc      https://www.abc.com
 *   abc.com/abc             www.abc.com/abc
 *
 * An optional "http://" / "https://" scheme and an optional leading "www."
 * are skipped; the host then runs up to the first '/', '?' or '#', or to the
 * end of the string when none of them is present.
 *
 * url  NUL-terminated URL; NULL is tolerated.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(), or NULL when url is NULL or the allocation fails.
 */
char* extract_hostname(char *url) {
    if (url == NULL) {
        return NULL;
    }

    size_t scheme_len = http_scheme_prefix_len(url);
    const char *host = url + scheme_len;

    /* Look for "www." at the start of the host, not at the start of the URL. */
    if (strncmp(host, "www.", 4) == 0) {
        host += 4;
    }

    /* strcspn stops at the terminating NUL, so a URL without a path is fine. */
    size_t len = strcspn(host, "/?#");

    char *ret = malloc(len + 1); /* +1 for the terminator */
    if (ret == NULL) {
        return NULL;
    }
    memcpy(ret, host, len);
    ret[len] = '\0';

    if (scheme_len != 0) {
        printf("schema: http://abc.com/abc \n ret is %s\n",  ret);
    } else {
        printf("schema: abc.com/abc \n ret is %s\n",  ret);
    }
    return ret;
}

把 hostname 解析成 ip 地址

函数 getIPFromDNS 便是做这个事情，主要调用 linux 的 gethostbyname 即可解析dns，得到一个 ip 数组, 通常选一个即可。

The gethostbyname() function returns a structure of type hostent for the given host name.

       The hostent structure is defined in <netdb.h> as follows:

           struct hostent {
               char  *h_name;            /* official name of host */
               char **h_aliases;         /* alias list */
               int    h_addrtype;        /* host address type */
               int    h_length;          /* length of address */
               char **h_addr_list;       /* list of addresses */
           }
           #define h_addr h_addr_list[0] /* for backward compatibility */

所以h_name就是正式 host 名，h_addr_list就是解析到的此 host 的 ip 地址数组。

/*
 * Resolves host with gethostbyname() and returns one of its IPv4 addresses
 * in dotted-decimal text form.
 *
 * Every resolved address is printed; the one left in the returned buffer is
 * the last entry of the resolver's address list.
 *
 * host  NUL-terminated host name (or dotted-decimal address); NULL is
 *       treated as a failed lookup.
 *
 * Returns a heap buffer the caller must free(). When the lookup fails, or
 * yields no IPv4 address, the buffer holds an empty string, so callers
 * always receive a valid C string. Returns NULL only if the allocation
 * itself fails.
 */
char* getIPFromDNS(char *host) {
    enum { ADDRESS_BUF_SIZE = 50 };

    /* calloc leaves an empty string in place for every failure path below. */
    char *address = calloc(ADDRESS_BUF_SIZE, 1);
    if (address == NULL) {
        fprintf(stderr, "ERROR, out of memory\n");
        return NULL;
    }

    struct hostent *server = (host != NULL) ? gethostbyname(host) : NULL;
    if (server == NULL) {
        fprintf(stderr,"ERROR, no such host\n");
        return address;
    }

    printf("server 's hostname: %s\n\n", server->h_name);

    /* The cast to struct in_addr below is only valid for IPv4 results. */
    if (server->h_addrtype != AF_INET || server->h_addr_list == NULL) {
        return address;
    }

    struct in_addr **addr_list = (struct in_addr **) server->h_addr_list;
    for (int i = 0; addr_list[i] != NULL; i++) {
        /* inet_ntoa returns a static buffer; copy it out before the next call. */
        snprintf(address, ADDRESS_BUF_SIZE, "%s", inet_ntoa(*addr_list[i]));
        printf(" 解析到的 ip 地址为: IP ADDRESS->%s\n", address);
    }
    return address;
}

配置 socket 的信息

函数init_serv_addr配置 socket 的信息，如使用 ipv4, 用 80 端口，访问哪个 ip，端口是多少等：

/*
 * Builds an IPv4 socket address for the given endpoint.
 *
 * address  dotted-decimal IPv4 text, converted with inet_addr().
 * port     port number in host byte order; stored in network byte order.
 *
 * Returns the populated structure by value; every field that is not set
 * explicitly is zero.
 */
struct sockaddr_in init_serv_addr(char *address, int port) {
    struct sockaddr_in endpoint;

    /* Start from an all-zero structure so the unused fields are well defined. */
    memset(&endpoint, 0, sizeof endpoint);

    endpoint.sin_port = htons(port);
    endpoint.sin_addr.s_addr = inet_addr(address);
    endpoint.sin_family = AF_INET;

    return endpoint;
}
    /* Open a TCP connection to port 80 of the resolved address. */
    struct sockaddr_in serv_addr = init_serv_addr(address, 80);

    int sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0) {
        /* Without a descriptor there is nothing to connect or clean up. */
        printf("socket err");
        return 0;
    }
    if (connect(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
        printf("connect err");
        close(sock); /* do not leak the descriptor on the failure path */
        return 0;
    }

生成 http request 头部

连接 socket 后，generate_request 生成 http request。注意第一行（请求行）描述了使用 GET 方法、HTTP 1.1 版本；Host 头部是必须的，因为大多数 web 服务器都配置了虚拟主机，会根据 Host 头部把请求路由到不同的站点，所以 http header 里不写 Host、仅仅使用 ip 是不行的，比如百度等。

/*
 * Formats an HTTP/1.1 GET request into request_body and prints it.
 *
 * hostname      value for the Host header. The header is mandatory: servers
 *               that host several sites pick the site from it.
 * url           URL being fetched. Its path and query become the
 *               request-target; the scheme, the authority and any
 *               "#fragment" are not sent. NULL, or a URL without a path,
 *               requests "/".
 * request_body  caller-provided output buffer. The write is NOT bounded by
 *               this function, so the buffer must be large enough for the
 *               fixed headers (about 220 bytes) plus the host name and the
 *               request-target.
 */
void generate_request(char *hostname, char *url, char *request_body)
{
    const char *target = "";
    size_t target_len = 0;

    if (url != NULL) {
        target = url;
        if (strncmp(target, "https://", 8) == 0) {
            target += 8;
        } else if (strncmp(target, "http://", 7) == 0) {
            target += 7;
        }
        /* Skip the authority: it ends at the first '/', '?' or '#'. */
        target += strcspn(target, "/?#");
        /* The fragment is for the client only and is never sent. */
        target_len = strcspn(target, "#");
    }

    /* The request-target must start with '/', e.g. "?q=1" is sent as "/?q=1". */
    const char *leading_slash = (target_len > 0 && target[0] == '/') ? "" : "/";

    /*
     * Adjacent string literals keep each header at column 0 of the message;
     * stray leading whitespace inside a header line would corrupt the request.
     */
    sprintf(request_body,
            "GET %s%.*s HTTP/1.1\r\n"
            "HOST: %s\r\n"
            "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36\r\n"
            "Cache-Control: no-cache\r\n\r\n",
            leading_slash, (int)target_len, target, hostname);

    printf("-> HTTP 请求报文如下\n--------HTTP Request--------\n%s--------\n", request_body);
}

写 socket，发送 request

total大小的 body 在发完之前一直写 socket，没有写过 linux 网络程序的人可以留意一下，写了多少是由 socket 自身决定的，程序控制不了，所以要 write 后才得知写了多少，然后调整下次发送的内容。

    /*
     * Send the request. write() may accept fewer bytes than asked for, so
     * keep writing from the first unsent byte until everything is out.
     */
    int total = strlen(request_body);
    int sent = 0;
    do {
        printf("total is %d \n", total);
        int bytes = write(sock, request_body + sent, total - sent);
        if (bytes < 0) {
            /* Stop here: adding a negative count to sent would corrupt the offset. */
            printf("ERROR writing message to socket");
            break;
        }
        if (bytes == 0)
            break;
        printf("wrote bytes %d \n", bytes);
        sent += bytes;
    } while (sent < total);

处理对方的返回

收到 response 的头部后(根据\r\n\r\n 划分), 解析出 Content-Length，接着收剩下的内容：

    /*
     * Receive the response into a fixed 10 KiB buffer.
     *
     * After every recv() the bytes collected so far are rescanned for the
     * blank line ("\r\n\r\n") that ends the header block, echoing the header
     * text and picking up Content-Length on the way. Reading stops when the
     * peer closes the connection, recv() fails, the buffer is full, or the
     * headers are complete and header bytes + Content-Length bytes have
     * arrived.
     *
     * NOTE(review): the header name is matched case-sensitively as
     * "Content-Length:"; servers that send a different capitalisation are
     * treated as having sent no length.
     */
    char resp[10*1024];
    memset(resp, 0, sizeof(resp));
    total = sizeof(resp)-1;  /* the last byte stays 0, so resp is always a C string */
    int received = 0;
    int content_len = 0;
    int body_size = 0;       /* expected size of headers + body, despite the name */
    int header_complete = 0; /* set once "\r\n\r\n" has been seen */
    do {
        content_len = 0;
        printf("---------------\n");
        printf("start received: %d, total: %d, total - received= %d \n", received, total,
                total-received);
        int bytes = recv(sock,  resp+received, total-received, 0);
        printf("received bytes %d \n", bytes);
        if (bytes < 0) {
            /* Stop here: adding a negative count to received would corrupt the offset. */
            printf("ERROR reading resp from socket");
            break;
        }
        if (bytes == 0) {
            printf("received bytes 0, break\n");
            break;
        }
        received += bytes;
        printf("-------- sizeof(resp) %zu\n", sizeof(resp));

        /* Scan only the bytes actually received, not the whole buffer. */
        int header_size = 0;
        header_complete = 0;
        for(; header_size < received; ++header_size) {
            if(strncmp(resp+header_size, "\r\n\r\n", strlen("\r\n\r\n")) == 0) {
                header_complete = 1;
                break;
            }
            if(strncmp(resp+header_size, "Content-Length:", strlen("Content-Length:")) == 0) {
                int tmp = header_size + strlen("Content-Length:");
                /* Optional whitespace between the colon and the value. */
                while(tmp < received && (resp[tmp] == ' ' || resp[tmp] == '\t')) {
                    tmp++;
                }
                content_len = 0;
                /*
                 * Accept digits only and never read past the received data.
                 * Values beyond the buffer size cannot be stored anyway, so
                 * parsing stops there, which also rules out int overflow.
                 */
                while(tmp < received && resp[tmp] >= '0' && resp[tmp] <= '9'
                        && content_len <= total) {
                    content_len = content_len*10 + (resp[tmp] - '0');
                    tmp++;
                }
            }
            printf("%c", resp[header_size]);
        }
        printf("\ncontent_len %d\n", content_len);
        if (header_complete) {
            /* Count the blank line as part of the header block. */
            header_size += strlen("\r\n\r\n");
        }
        body_size = content_len + header_size;
        printf("header size %d\n", header_size);
        printf("\nbody_size %d\n", body_size);
        printf("end received: %d, total: %d, total - received= %d \n", received, total,
                total-received);
    } while(received < total && (!header_complete || received != body_size));

至此, 一个简单的 http client 完成.

示例

http://example.com 解析到的 ip 地址为: IP ADDRESS->93.184.216.34

HTTP 请求报文如下

--------HTTP Request--------
GET / HTTP/1.1
HOST: example.com
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36
Cache-Control: no-cache

服务器 response 回复的头部为：

HTTP/1.1 200 OK
Cache-Control: max-age=604800
Content-Type: text/html
Date: Thu, 21 Jun 2018 10:35:29 GMT
Etag: "1541025663+ident"
Expires: Thu, 28 Jun 2018 10:35:29 GMT
Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
Server: ECS (oxr/8313)
Vary: Accept-Encoding
X-Cache: HIT
Content-Length: 1270

接着的内容是：

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 50px;
        background-color: #fff;
        border-radius: 1em;
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        body {
            background-color: #fff;
        }
        div {
            width: auto;
            margin: 0 auto;
            border-radius: 0;
            padding: 1em;
        }
    }
    </style>
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>

以上是网页内容了。