Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 1 | /* vi: set sw=4 ts=4: */ |
| 2 | /* |
| 3 | * shuf: Write a random permutation of the input lines to standard output. |
| 4 | * |
| 5 | * Copyright (C) 2014 by Bartosz Golaszewski <bartekgola@gmail.com> |
| 6 | * |
| 7 | * Licensed under GPLv2 or later, see file LICENSE in this source tree. |
| 8 | */ |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 9 | //config:config SHUF |
Denys Vlasenko | 4eed2c6 | 2017-07-18 22:01:24 +0200 | [diff] [blame] | 10 | //config: bool "shuf (5.4 kb)" |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 11 | //config: default y |
| 12 | //config: help |
Denys Vlasenko | 72089cf | 2017-07-21 09:50:55 +0200 | [diff] [blame] | 13 | //config: Generate random permutations |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 14 | |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 15 | //applet:IF_SHUF(APPLET_NOEXEC(shuf, shuf, BB_DIR_USR_BIN, BB_SUID_DROP, shuf)) |
| 16 | |
Denys Vlasenko | 0c4dbd4 | 2017-09-18 16:28:43 +0200 | [diff] [blame] | 17 | //kbuild:lib-$(CONFIG_SHUF) += shuf.o |
| 18 | |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 19 | //usage:#define shuf_trivial_usage |
Denys Vlasenko | d59f539 | 2021-08-23 15:48:22 +0200 | [diff] [blame] | 20 | //usage: "[-n NUM] [-o FILE] [-z] [FILE | -e [ARG...] | -i L-H]" |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 21 | //usage:#define shuf_full_usage "\n\n" |
Denys Vlasenko | 69f9567 | 2014-03-07 14:41:53 +0100 | [diff] [blame] | 22 | //usage: "Randomly permute lines\n" |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 23 | //usage: "\n -n NUM Output at most NUM lines" |
Denys Vlasenko | 69f9567 | 2014-03-07 14:41:53 +0100 | [diff] [blame] | 24 | //usage: "\n -o FILE Write to FILE, not standard output" |
Ron Yorston | 8817e28 | 2021-08-07 09:41:49 +0100 | [diff] [blame] | 25 | //usage: "\n -z NUL terminated output" |
Denys Vlasenko | d59f539 | 2021-08-23 15:48:22 +0200 | [diff] [blame] | 26 | //usage: "\n -e Treat ARGs as lines" |
| 27 | //usage: "\n -i L-H Treat numbers L-H as lines" |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 28 | |
| 29 | #include "libbb.h" |
| 30 | |
| 31 | /* This is a NOEXEC applet. Be very careful! */ |
| 32 | |
/*
 * Option bits tested against the value returned by getopt32().
 * The bit positions are listed in the same order as the option
 * letters appear in OPT_STR.
 */
enum {
	OPT_e = (1 << 0),
	OPT_i = (1 << 1),
	OPT_n = (1 << 2),
	OPT_o = (1 << 3),
	OPT_z = (1 << 4),
};
/* Kept as a macro: it is spliced into a larger string literal */
#define OPT_STR "ei:n:o:z"
| 39 | |
/*
 * Return a uniformly distributed random number in the range [0, limit).
 * The caller must pass limit >= 1.
 *
 * Candidates are assembled from rand() output, masked down to the
 * smallest power-of-two range that covers limit, and discarded if they
 * fall outside [0, limit). Unlike "rand() % limit" this does not favour
 * small results, and since more than half of the masked range is always
 * acceptable, fewer than two candidates are needed on average.
 */
static unsigned random_below(unsigned limit)
{
	unsigned mask, shift, bits, r;

	/* Smallest value of the form 2^k - 1 which is >= limit - 1 */
	mask = limit - 1;
	for (shift = 1; shift < sizeof(mask) * CHAR_BIT; shift <<= 1)
		mask |= mask >> shift;

	do {
		/*
		 * RAND_MAX can be as small as 32767, so only 15 bits are
		 * taken from each rand() call; as many calls are made as
		 * are needed to cover the mask. All shifting is done on
		 * unsigned values: shifting the (signed) result of rand()
		 * directly could overflow, which is undefined behaviour.
		 */
		r = 0;
		bits = mask;
		do {
			r = (r << 15) | ((unsigned)rand() & 0x7fff);
			bits >>= 15;
		} while (bits != 0);
		r &= mask;
	} while (r >= limit);

	return r;
}

/*
 * Use the Fisher-Yates shuffle algorithm on an array of lines.
 * If the required number of output lines is less than the total
 * we can stop shuffling early.
 *
 * lines:    array of numlines elements, permuted in place
 * numlines: number of elements in lines
 * outlines: number of elements wanted by the caller; values larger
 *           than numlines are treated as numlines
 *
 * The array is filled from the end: on return, the last outlines
 * elements are a random selection, in random order.
 */
static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
{
	srand(monotonic_us());

	/* Never try to pick more elements than there are */
	if (outlines > numlines)
		outlines = numlines;

	while (outlines != 0) {
		char *tmp;
		/* Pick one of the not yet placed elements [0, numlines) ... */
		unsigned r = random_below(numlines);

		/* ... and swap it into the last free slot */
		numlines--;
		tmp = lines[numlines];
		lines[numlines] = lines[r];
		lines[r] = tmp;
		outlines--;
	}
}
| 68 | |
| 69 | int shuf_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
| 70 | int shuf_main(int argc, char **argv) |
| 71 | { |
| 72 | unsigned opts; |
| 73 | char *opt_i_str, *opt_n_str, *opt_o_str; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 74 | char **lines; |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 75 | unsigned long long lo = lo; |
Ron Yorston | 8817e28 | 2021-08-07 09:41:49 +0100 | [diff] [blame] | 76 | unsigned numlines, outlines; |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 77 | unsigned i; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 78 | char eol; |
| 79 | |
Denys Vlasenko | 22542ec | 2017-08-08 21:55:02 +0200 | [diff] [blame] | 80 | opts = getopt32(argv, "^" |
| 81 | OPT_STR |
| 82 | "\0" "e--i:i--e"/* mutually exclusive */, |
| 83 | &opt_i_str, &opt_n_str, &opt_o_str |
| 84 | ); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 85 | |
| 86 | argc -= optind; |
| 87 | argv += optind; |
| 88 | |
| 89 | /* Prepare lines for shuffling - either: */ |
| 90 | if (opts & OPT_e) { |
| 91 | /* make lines from command-line arguments */ |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 92 | numlines = argc; |
| 93 | lines = argv; |
| 94 | } else |
| 95 | if (opts & OPT_i) { |
| 96 | /* create a range of numbers */ |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 97 | unsigned long long hi; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 98 | char *dash; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 99 | |
Denys Vlasenko | 60f4843 | 2021-08-23 15:52:34 +0200 | [diff] [blame] | 100 | if (argv[0]) |
| 101 | bb_show_usage(); |
| 102 | |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 103 | dash = strchr(opt_i_str, '-'); |
| 104 | if (!dash) { |
| 105 | bb_error_msg_and_die("bad range '%s'", opt_i_str); |
| 106 | } |
| 107 | *dash = '\0'; |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 108 | lo = xatoull(opt_i_str); |
| 109 | hi = xatoull(dash + 1); |
Maninder Singh | 7db312a | 2015-06-01 10:40:09 +0000 | [diff] [blame] | 110 | *dash = '-'; |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 111 | if (hi < lo) |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 112 | bb_error_msg_and_die("bad range '%s'", opt_i_str); |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 113 | hi -= lo; |
| 114 | if (sizeof(size_t) > sizeof(numlines)) { |
| 115 | if (hi >= UINT_MAX) |
| 116 | bb_error_msg_and_die("bad range '%s'", opt_i_str); |
| 117 | } else { |
| 118 | if (hi >= UINT_MAX / sizeof(lines[0])) |
| 119 | bb_error_msg_and_die("bad range '%s'", opt_i_str); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 120 | } |
| 121 | |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 122 | numlines = hi + 1; |
| 123 | lines = xmalloc((size_t)numlines * sizeof(lines[0])); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 124 | for (i = 0; i < numlines; i++) { |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 125 | lines[i] = (char*)(uintptr_t)i; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 126 | } |
| 127 | } else { |
| 128 | /* default - read lines from stdin or the input file */ |
| 129 | FILE *fp; |
Denys Vlasenko | 49a2e48 | 2021-08-29 14:39:01 +0200 | [diff] [blame] | 130 | const char *fname = "-"; |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 131 | |
Denys Vlasenko | 49a2e48 | 2021-08-29 14:39:01 +0200 | [diff] [blame] | 132 | if (argv[0]) { |
| 133 | if (argv[1]) |
| 134 | bb_show_usage(); |
| 135 | fname = argv[0]; |
| 136 | } |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 137 | |
Denys Vlasenko | 49a2e48 | 2021-08-29 14:39:01 +0200 | [diff] [blame] | 138 | fp = xfopen_stdin(fname); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 139 | lines = NULL; |
| 140 | numlines = 0; |
| 141 | for (;;) { |
| 142 | char *line = xmalloc_fgetline(fp); |
| 143 | if (!line) |
| 144 | break; |
| 145 | lines = xrealloc_vector(lines, 6, numlines); |
| 146 | lines[numlines++] = line; |
| 147 | } |
| 148 | fclose_if_not_stdin(fp); |
| 149 | } |
| 150 | |
Ron Yorston | 8817e28 | 2021-08-07 09:41:49 +0100 | [diff] [blame] | 151 | outlines = numlines; |
| 152 | if (opts & OPT_n) { |
| 153 | outlines = xatou(opt_n_str); |
| 154 | if (outlines > numlines) |
| 155 | outlines = numlines; |
| 156 | } |
| 157 | |
| 158 | shuffle_lines(lines, numlines, outlines); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 159 | |
| 160 | if (opts & OPT_o) |
| 161 | xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO); |
| 162 | |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 163 | eol = '\n'; |
| 164 | if (opts & OPT_z) |
| 165 | eol = '\0'; |
| 166 | |
Ron Yorston | 8817e28 | 2021-08-07 09:41:49 +0100 | [diff] [blame] | 167 | for (i = numlines - outlines; i < numlines; i++) { |
Denys Vlasenko | f4ba69d | 2021-09-04 17:00:22 +0200 | [diff] [blame] | 168 | if (opts & OPT_i) |
| 169 | printf("%llu%c", lo + (uintptr_t)lines[i], eol); |
| 170 | else |
Denys Vlasenko | 102f0d0 | 2014-03-07 14:32:39 +0100 | [diff] [blame] | 171 | printf("%s%c", lines[i], eol); |
Denys Vlasenko | 2cdcb10 | 2014-03-05 18:56:20 +0100 | [diff] [blame] | 172 | } |
| 173 | |
| 174 | fflush_stdout_and_exit(EXIT_SUCCESS); |
| 175 | } |