wget.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* wget.c - Simple downloader to get the resource file from a HTTP server
  2. *
  3. * Copyright 2016 Lipi C.H. Lee <lipisoft@gmail.com>
  4. * Copyright 2021 Eric Molitor <eric@molitor.org>
  5. *
  6. * Relevant sources of information
  7. * -------------------------------
  8. * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230
  9. * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1
  10. * UTF-8 Encoded Header Values https://www.rfc-editor.org/rfc/rfc5987
  11. *
  12. * Test URLs
  13. * ---------
  14. * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript
  15. * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html
  16. * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html
  17. * TLS 1.0: https://tls-v1-0.badssl.com:1010/
  18. * TLS 1.1: https://tls-v1-1.badssl.com:1011/
  19. * TLS 1.2: https://tls-v1-2.badssl.com:1012/
  20. * TLS 1.3: https://tls13.1d.pw/
  21. * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt
  22. *
  23. *
  24. * todo: Add support for configurable TLS versions
  25. * todo: Add support for ftp
  26. * todo: Add support for Transfer Encoding (gzip|deflate)
  27. * todo: Add support for RFC5987
  28. USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):p(post-data):", TOYFLAG_USR|TOYFLAG_BIN))
  29. config WGET
  30. bool "wget"
  31. default n
  32. help
  33. usage: wget [OPTIONS]... [URL]
  34. --max-redirect maximum redirections allowed
  35. -d, --debug print lots of debugging information
  36. -O, --output-document=FILE specify output filename
  37. -p, --post-data=DATA send data in body of POST request
  38. examples:
  39. wget http://www.example.com
  40. config WGET_LIBTLS
  41. bool "Enable HTTPS support for wget via LibTLS"
  42. default n
  43. depends on WGET && !WGET_OPENSSL
  44. help
  45. Enable HTTPS support for wget by linking to LibTLS.
  46. Supports using libtls, libretls or libtls-bearssl.
  47. config WGET_OPENSSL
  48. bool "Enable HTTPS support for wget via OpenSSL"
  49. default n
  50. depends on WGET && !WGET_LIBTLS
  51. help
  52. Enable HTTPS support for wget by linking to OpenSSL.
  53. */
  54. #define FOR_wget
  55. #include "toys.h"
  56. #if CFG_WGET_LIBTLS
  57. #define WGET_SSL 1
  58. #include <tls.h>
  59. #elif CFG_WGET_OPENSSL
  60. #define WGET_SSL 1
  61. #include <openssl/crypto.h>
  62. #include <openssl/ssl.h>
  63. #include <openssl/err.h>
  64. #else
  65. #define WGET_SSL 0
  66. #endif
  67. #define HTTPS (WGET_SSL && TT.https)
  68. GLOBALS(
  69. char *p, *O;
  70. long max_redirect;
  71. int sock, https;
  72. char *url;
  73. #if CFG_WGET_LIBTLS
  74. struct tls *tls;
  75. #elif CFG_WGET_OPENSSL
  76. struct ssl_ctx_st *ctx;
  77. struct ssl_st *ssl;
  78. #endif
  79. )
  80. // get http info in URL
  81. static void wget_info(char *url, char **host, char **port, char **path)
  82. {
  83. char *ss = url;
  84. // Must start with case insensitive http:// or https://
  85. if (strncmp(url, "http", 4)) url = 0;
  86. else {
  87. url += 4;
  88. if ((TT.https = WGET_SSL && toupper(*url=='s'))) url++;
  89. if (!strstart(&url, "://")) url = 0;
  90. }
  91. if (!url) error_exit("unsupported protocol: %s", ss);
  92. if ((*path = strchr(*host = url, '/'))) *((*path)++) = 0;
  93. else *path = "";
  94. // Get port number and trim literal IPv6 addresses
  95. if (**host=='[' && (ss = strchr(++*host, ']'))) {
  96. *ss++ = 0;
  97. *port = (*ss==':') ? ++ss : 0;
  98. } else if ((*port = strchr(*host, ':'))) *(*port++) = 0;
  99. if (!*port) *port = HTTPS ? "443" : "80";
  100. }
  // Open a connection to host:port, plain TCP or TLS depending on HTTPS.
  // The plain and OpenSSL paths store the socket in TT.sock; the libtls path
  // lets tls_connect() create and own the connection. Exits on any failure.
  static void wget_connect(char *host, char *port)
  {
    if (!HTTPS)
      TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));
    else {
  #if CFG_WGET_LIBTLS
      struct tls_config *cfg = NULL;
      uint32_t protocols;

      // When these constructors fail there is no context/config object to
      // query for an error string, so report the failing call by name only.
      if (!(TT.tls = tls_client())) error_exit("tls_client");
      if (!(cfg = tls_config_new())) error_exit("tls_config_new");
      if (tls_config_parse_protocols(&protocols, "tlsv1.2"))
        error_exit("tls_config_parse_protocols");
      if (tls_config_set_protocols(cfg, protocols))
        error_exit("tls_config_set_protocols: %s", tls_config_error(cfg));
      if (tls_configure(TT.tls, cfg))
        error_exit("tls_configure: %s", tls_error(TT.tls));
      tls_config_free(cfg);
      if (tls_connect(TT.tls, host, port))
        error_exit("tls_connect: %s", tls_error(TT.tls));
  #elif CFG_WGET_OPENSSL
      SSL_library_init();
      OpenSSL_add_all_algorithms();
      SSL_load_error_strings();
      ERR_load_crypto_strings();

      // NOTE(review): no verify mode or trust store is configured on this
      // context, so the server certificate does not appear to be checked.
      // Confirm whether that is intended.
      TT.ctx = SSL_CTX_new(TLS_client_method());
      if (!TT.ctx) error_exit("SSL_CTX_new");

      TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));

      TT.ssl = SSL_new(TT.ctx);
      if (!TT.ssl)
        error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));

      // Send the host name in the handshake (SNI)
      if (!SSL_set_tlsext_host_name(TT.ssl, host))
        error_exit("SSL_set_tlsext_host_name: %s",
          ERR_error_string(ERR_get_error(), NULL));

      if (!SSL_set_fd(TT.ssl, TT.sock))
        error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));

      // Only a return of 1 means the handshake completed; 0 and negative
      // values are both failures.
      if (SSL_connect(TT.ssl) != 1)
        error_exit("SSL_connect: %s", ERR_error_string(ERR_get_error(), NULL));

      if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl));
  #endif
    }
  }
  143. static size_t wget_read(void *buf, size_t len)
  144. {
  145. if (!HTTPS) return xread(TT.sock, buf, len);
  146. else {
  147. char *err = 0;
  148. int ret;
  149. #if CFG_WGET_LIBTLS
  150. if ((ret = tls_read(TT.tls, buf, len))<0) err = tls_error(TT.tls);
  151. #elif CFG_WGET_OPENSSL
  152. if ((ret = SSL_read(TT.ssl, buf, len))<0)
  153. err = ERR_error_string(ERR_get_error(), 0);
  154. #endif
  155. if (err) error_exit("https read: %s", err);
  156. return ret;
  157. }
  158. }
  159. static void wget_write(void *buf, size_t len)
  160. {
  161. if (!HTTPS) xwrite(TT.sock, buf, len);
  162. else {
  163. char *err = 0;
  164. #if CFG_WGET_LIBTLS
  165. if (len != tls_write(TT.tls, buf, len)) err = tls_error(TT.tls);
  166. #elif CFG_WGET_OPENSSL
  167. if (len != SSL_write(TT.ssl, buf, len))
  168. err = ERR_error_string(ERR_get_error(), 0);
  169. #endif
  170. if (err) error_exit("https write: %s", err);
  171. }
  172. }
  173. static void wget_close()
  174. {
  175. if (TT.sock) {
  176. xclose(TT.sock);
  177. TT.sock = 0;
  178. }
  179. #if CFG_WGET_LIBTLS
  180. if (TT.tls) {
  181. tls_close(TT.tls);
  182. tls_free(TT.tls);
  183. TT.tls = 0;
  184. }
  185. #elif CFG_WGET_OPENSSL
  186. if (TT.ssl) {
  187. SSL_shutdown(TT.ssl);
  188. SSL_free(TT.ssl);
  189. TT.ssl = 0;
  190. }
  191. if (TT.ctx) {
  192. SSL_CTX_free(TT.ctx);
  193. TT.ctx = 0;
  194. }
  195. #endif
  196. }
  // Find a header field in a NUL terminated block of response headers.
  //
  // val is the case insensitive prefix to look for (field name plus any
  // fixed leading part of the value) and must begin a line of header.
  // Returns a pointer to the text following val, or NULL when not found.
  //
  // The returned string is terminated in place by overwriting the end of
  // its line in header with NUL, so any header lines after it are no longer
  // reachable through header afterwards.
  static char *wget_find_header(char *header, char *val)
  {
    char *result = header;

    // Skip matches in the middle of a line, so "Location: " does not match
    // "Content-Location: " or text inside another field's value.
    while ((result = strcasestr(result, val))) {
      if (result == header || result[-1] == '\n') break;
      result++;
    }

    if (result) {
      result += strlen(val);
      result[strcspn(result, "\r\n")] = 0;
    }

    return result;
  }
  206. void wget_main(void)
  207. {
  208. long status = 0;
  209. size_t len, c_len = 0;
  210. int fd = 0;
  211. char *body, *index, *host, *port, *path, *chunked, *ss;
  212. char agent[] = "toybox wget/" TOYBOX_VERSION;
  213. TT.url = xstrdup(*toys.optargs);
  214. // Ask server for URL, following redirects until success
  215. while (status != 200) {
  216. if (!TT.max_redirect--) error_exit("Too many redirects");
  217. // Connect and write request
  218. wget_info(TT.url, &host, &port, &path);
  219. if (TT.p) sprintf(toybuf, "Content-Length: %ld\r\n", strlen(TT.p));
  220. ss = xmprintf("%s /%s HTTP/1.1\r\nHost: %s\r\nUser-Agent: %s\r\n"
  221. "Connection: close\r\n%s\r\n%s", FLAG(p) ? "POST" : "GET",
  222. path, host, agent, FLAG(p) ? toybuf : "", FLAG(p)?TT.p:"");
  223. if (FLAG(d)) printf("--- Request\n%s", ss);
  224. wget_connect(host, port);
  225. wget_write(ss, strlen(ss));
  226. free(ss);
  227. // Read HTTP response into toybuf (probably with some body at end)
  228. for (index = toybuf;
  229. (len = wget_read(index, sizeof(toybuf)-(index-toybuf)))>0; index += len);
  230. // Split response into header and body, and null terminate header.
  231. // (RFC7230 says header cannot contain NUL.)
  232. if (!(body = memmem(ss = toybuf, index-toybuf, "\r\n\r\n", 4)))
  233. error_exit("response header too large");
  234. *body = 0;
  235. body += 4;
  236. len = index-body;
  237. if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf);
  238. status = strstart(&ss, "HTTP/1.1 ") ? strtol(ss, 0, 10) : 0;
  239. if ((status == 301) || (status == 302)) {
  240. if (!(ss = wget_find_header(toybuf, "Location: ")))
  241. error_exit("bad redirect");
  242. free(TT.url);
  243. TT.url = xstrdup(ss);
  244. wget_close();
  245. } else if (status != 200) error_exit("response: %ld", status);
  246. }
  247. // Open output file
  248. if (TT.O && !strcmp(TT.O, "-")) fd = 1;
  249. else if (!TT.O) {
  250. ss = wget_find_header(toybuf, "Content-Disposition: attachment; filename=");
  251. if (!ss && strchr(path, '/')) ss = getbasename(path);
  252. if (!ss || !*ss ) ss = "index.html";
  253. if (!access((TT.O = ss), F_OK)) error_exit("%s already exists", TT.O);
  254. }
  255. // TODO: don't allow header/basename to write to stdout
  256. if (!fd) fd = xcreate(TT.O, (O_WRONLY|O_CREAT|O_TRUNC), 0644);
  257. // If chunked we offset the first buffer by 2 character, meaning it is
  258. // pointing at half of the header boundary, aka '\r\n'. This simplifies
  259. // parsing of the first c_len length by allowing the do while loop to fall
  260. // through on the first iteration and parse the first c_len size.
  261. chunked = wget_find_header(toybuf, "transfer-encoding: chunked");
  262. if (chunked) memmove(toybuf, body-2, len += 2);
  263. else memmove(toybuf, body, len);
  264. // len is the size remaining in toybuf
  265. // c_len is the size of the remaining bytes in the current chunk
  266. do {
  267. if (chunked) {
  268. if (c_len > 0) { // We have an incomplete c_len to write
  269. if (len <= c_len) { // Buffer is less than the c_len so full write
  270. xwrite(fd, toybuf, len);
  271. c_len = c_len - len;
  272. len = 0;
  273. } else { // Buffer is larger than the c_len so partial write
  274. xwrite(fd, toybuf, c_len);
  275. len = len - c_len;
  276. memmove(toybuf, toybuf + c_len, len);
  277. c_len = 0;
  278. }
  279. }
  280. // If len is less than 2 we can't validate the chunk boundary so fall
  281. // through and go read more into toybuf.
  282. if (!c_len && (len > 2)) {
  283. char *c;
  284. if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary");
  285. // If we can't find the end of the new chunk signature fall through and
  286. // read more into toybuf.
  287. c = memmem(toybuf + 2, len - 2, "\r\n",2);
  288. if (c) {
  289. c_len = strtol(toybuf + 2, NULL, 16);
  290. if (!c_len) break; // A c_len of zero means we are complete
  291. len = len - (c - toybuf) - 2;
  292. memmove(toybuf, c + 2, len);
  293. }
  294. }
  295. if (len == sizeof(toybuf)) error_exit("chunk overflow");
  296. } else {
  297. xwrite(fd, toybuf, len);
  298. len = 0;
  299. }
  300. } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0);
  301. wget_close();
  302. free(TT.url);
  303. }