cut.c 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /* cut.c - print selected ranges from a file
  2. *
  3. * Copyright 2016 Rob Landley <rob@landley.net>
  4. *
  5. * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
  6. *
  7. * Deviations from posix: added -DF. We can only accept 512 selections, and
  8. * "-" counts as start to end. Using spaces to separate a comma-separated list
  9. * is silly and inconsistent with dd, ps, cp, and mount.
  10. *
  11. * todo: -n, -s with -c
  12. USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbfF]", TOYFLAG_USR|TOYFLAG_BIN))
  13. config CUT
  14. bool "cut"
  15. default y
  16. help
  17. usage: cut [-Ds] [-bcCfF LIST] [-dO DELIM] [FILE...]
  18. Print selected parts of lines from each FILE to standard output.
  19. Each selection LIST is comma separated, either numbers (counting from 1)
  20. or dash separated ranges (inclusive, with X- meaning to end of line and -X
  21. from start). By default selection ranges are sorted and collated, use -D
  22. to prevent that.
  23. -b Select bytes
  24. -c Select UTF-8 characters
  25. -C Select unicode columns
  26. -d Use DELIM (default is TAB for -f, run of whitespace for -F)
  27. -D Don't sort/collate selections or match -fF lines without delimiter
  28. -f Select fields (words) separated by single DELIM character
  29. -F Select fields separated by DELIM regex
  30. -O Output delimiter (default one space for -F, input delim for -f)
  31. -s Skip lines without delimiters
  32. */
  33. #define FOR_cut
  34. #include "toys.h"
  35. GLOBALS(
  36. char *d, *O;
  37. struct arg_list *select[5]; // we treat them the same, so loop through
  38. unsigned line;
  39. int pairs;
  40. regex_t reg;
  41. )
  42. // Apply selections to an input line, producing output
  43. static void cut_line(char **pline, long len)
  44. {
  45. unsigned *pairs = (void *)toybuf, wc;
  46. char *line;
  47. int i, j, k;
  48. if (!pline) return;
  49. line = *pline;
  50. if (len && line[len-1]=='\n') line[--len] = 0;
  51. TT.line++;
  52. // Loop through selections
  53. for (i=0; i<TT.pairs; i++) {
  54. unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
  55. char *s = line, *ss, *sss;
  56. // when the delimiter is \n output lines.
  57. if (*TT.d == '\n') {
  58. if (TT.line<start || TT.line>end) {
  59. if (i+1 == TT.pairs) return;
  60. continue;
  61. }
  62. goto write_line;
  63. }
  64. // input: start/end position, count=difference between them
  65. // output: s = start of string, len = bytes to output
  66. if (start) start--;
  67. if (start>=len) continue;
  68. if (!end || end>len) end = len;
  69. count = end-start;
  70. // Find start and end of output string for the relevant selection type
  71. if (FLAG(b)) {
  72. if (!FLAG(n)) s += start;
  73. else {
  74. if (end>len) end = len;
  75. for (sss = ss = s; (k = (ss-line))<end;) {
  76. if (0>(j = utf8towc(&wc, ss, len))) ss++;
  77. else {
  78. if (((ss += j)-line)<=end) sss = ss;
  79. if ((ss-line)<=start) s = ss;
  80. }
  81. }
  82. if (!(count = sss-s)) continue;
  83. }
  84. } else if (FLAG(C)) {
  85. // crunch_str() currently assumes that combining characters get
  86. // escaped, to provide an unambiguous visual representation.
  87. // This assumes the input string is null terminated.
  88. if (start) crunch_str(&s, start, 0, 0, 0);
  89. if (!*s) continue;
  90. start = s-line;
  91. ss = s;
  92. crunch_str(&ss, count, 0, 0, 0);
  93. count = ss-s;
  94. } else if (FLAG(c)) {
  95. // Find start
  96. ss = line+len;
  97. while (start && s<ss) {
  98. if (0<=(j = utf8towc(&wc, s, len))) start--;
  99. s += (j<1) ? 1 : j;
  100. }
  101. if (s == ss) continue;
  102. // Find end
  103. end = count;
  104. sss = s;
  105. while (end && sss<ss) {
  106. if (0<=(j = utf8towc(&wc, sss, len))) end--;
  107. sss += (j<1) ? 1 : j;
  108. }
  109. count = sss-s;
  110. } else {
  111. regmatch_t match;
  112. // Loop through skipping appropriate number of fields
  113. for (j = 0; j<2; j++) {
  114. ss = s;
  115. if (j) start = count;
  116. else end = start;
  117. while (*ss && start) {
  118. if (FLAG(f)) {
  119. if (!strchr(TT.d, *ss++)) continue;
  120. if (!--start && j) ss--;
  121. } else {
  122. if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
  123. ss = line+len;
  124. continue;
  125. }
  126. if (!match.rm_eo) break; // zero length match == no delimiter
  127. ss += (!--start && j) ? match.rm_so : match.rm_eo;
  128. }
  129. }
  130. if (!j && !*(s = ss)) break;
  131. }
  132. // If we never encountered even one separator, print whole line (posix!)
  133. if (!j && end == start) {
  134. if (FLAG(D)) break;
  135. if (FLAG(s)) return;
  136. write_line:
  137. fwrite(line, len, 1, stdout);
  138. break;
  139. } else if (!*s) continue;
  140. count = ss-s;
  141. }
  142. if (i && TT.O) fputs(TT.O, stdout);
  143. fwrite(s, count, 1, stdout);
  144. }
  145. xputc('\n');
  146. }
  // qsort() comparison callback for selections. Each element is a pair of
  // unsigned values (start, end); order by start first, then by end.
  // Takes const void * so the function type matches what qsort() calls:
  // calling through an incompatible function pointer type is undefined.
  static int compar(const void *a, const void *b)
  {
    const unsigned *x = a, *y = b;
    int i;

    for (i = 0; i<2; i++) {
      if (x[i]<y[i]) return -1;
      if (x[i]>y[i]) return 1;
    }

    return 0;
  }
  155. // parse A or A-B or A- or -B
  156. static char *get_range(void *data, char *str, int len)
  157. {
  158. char *end = str;
  159. unsigned *pairs = (void *)toybuf, i;
  160. // Using toybuf[] to store ranges means we can have 512 selections max.
  161. if (TT.pairs == sizeof(toybuf)/sizeof(int)) perror_exit("select limit");
  162. pairs += 2*TT.pairs++;
  163. pairs[1] = UINT_MAX;
  164. for (i = 0; ;i++) {
  165. if (i==2) return end;
  166. if (isdigit(*end)) {
  167. long long ll = estrtol(end, &end, 10);
  168. if (ll<1 || ll>UINT_MAX || errno) return end;
  169. pairs[i] = ll;
  170. }
  171. if (*end++ != '-') break;
  172. }
  173. if (!i) pairs[1] = pairs[0];
  174. if ((end-str)<len) return end;
  175. if (pairs[0]>pairs[1]) return str;
  176. // No error
  177. return 0;
  178. }
  179. void cut_main(void)
  180. {
  181. int i;
  182. char buf[8];
  183. // Parse command line arguments
  184. if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
  185. error_exit("-s needs -Ff");
  186. if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
  187. error_exit("-d needs -Ff");
  188. if (!TT.d) TT.d = (FLAG(F)) ? "[[:space:]][[:space:]]*" : "\t";
  189. if (FLAG(F)) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
  190. if (!TT.O) {
  191. if (FLAG(F)) TT.O = " ";
  192. else if (FLAG(f)) TT.O = TT.d;
  193. }
  194. // Parse ranges, which are attached to a selection type (only one can be set)
  195. for (i = 0; i<ARRAY_LEN(TT.select); i++) {
  196. sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
  197. if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
  198. }
  199. if (!TT.pairs) error_exit("no selections");
  200. // Sort and collate selections
  201. if (!FLAG(D)) {
  202. int from, to;
  203. unsigned *pairs = (void *)toybuf;
  204. qsort(toybuf, TT.pairs, 8, (void *)compar);
  205. for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
  206. if (pairs[from] > pairs[to+1]) {
  207. to += 2;
  208. memcpy(pairs+to, pairs+from, 2*sizeof(unsigned));
  209. } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
  210. }
  211. TT.pairs = (to/2)+1;
  212. }
  213. // For each argument, loop through lines of file and call cut_line() on each
  214. loopfiles_lines(toys.optargs, cut_line);
  215. }