/* vi: set sw=4 ts=4: */
/*
 * shuf: Write a random permutation of the input lines to standard output.
 *
 * Copyright (C) 2014 by Bartosz Golaszewski <bartekgola@gmail.com>
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
//config:config SHUF
//config: bool "shuf (5.4 kb)"
//config: default y
//config: help
//config: Generate random permutations

//applet:IF_SHUF(APPLET_NOEXEC(shuf, shuf, BB_DIR_USR_BIN, BB_SUID_DROP, shuf))

//kbuild:lib-$(CONFIG_SHUF) += shuf.o

//usage:#define shuf_trivial_usage
//usage: "[-n NUM] [-o FILE] [-z] [FILE | -e [ARG...] | -i L-H]"
//usage:#define shuf_full_usage "\n\n"
//usage: "Randomly permute lines\n"
//usage: "\n -n NUM Output at most NUM lines"
//usage: "\n -o FILE Write to FILE, not standard output"
//usage: "\n -z NUL terminated output"
//usage: "\n -e Treat ARGs as lines"
//usage: "\n -i L-H Treat numbers L-H as lines"

29#include "libbb.h"
30
31/* This is a NOEXEC applet. Be very careful! */
32
/* One bit per option letter, in the same order as the letters in OPT_STR */
enum {
	OPT_e = 1 << 0,
	OPT_i = 1 << 1,
	OPT_n = 1 << 2,
	OPT_o = 1 << 3,
	OPT_z = 1 << 4,
};
#define OPT_STR "ei:n:o:z"

39
/*
 * Use the Fisher-Yates shuffle algorithm on an array of lines.
 * If the required number of output lines is less than the total
 * we can stop shuffling early.
 *
 * lines:    array of numlines entries, permuted in place
 * numlines: number of entries in lines[]
 * outlines: number of entries to pick; values larger than numlines
 *           are treated as numlines
 *
 * The array is filled from the end: on return the last outlines
 * slots of lines[] hold the randomly picked entries.
 */
static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines)
{
	/* Cannot pick more entries than exist. This also guarantees
	 * numlines != 0 whenever the modulo below is evaluated. */
	if (outlines > numlines)
		outlines = numlines;

	srand(monotonic_us());

	while (outlines != 0) {
		char *tmp;
		unsigned r = rand();
		/* RAND_MAX can be as small as 32767 */
		if (numlines > RAND_MAX) {
			/* Shift as unsigned: rand() returns int, and with
			 * a 31-bit RAND_MAX "rand() << 15" would overflow
			 * a signed int, which is undefined behavior. */
			r ^= (unsigned)rand() << 15;
		}
		r %= numlines;
//TODO: the above method is seriously non-uniform when numlines is very large.
//For example, with numlines of 0xf0000000,
//values of (r % numlines) in [0, 0x0fffffff] range
//are more likely: e.g. r=1 and r=0xf0000001 both map to 1,
//whereas only one value, r=0xefffffff, maps to 0xefffffff.
		/* Swap the picked entry into the last not-yet-fixed slot */
		numlines--;
		tmp = lines[numlines];
		lines[numlines] = lines[r];
		lines[r] = tmp;
		outlines--;
	}
}
68
69int shuf_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
70int shuf_main(int argc, char **argv)
71{
72 unsigned opts;
73 char *opt_i_str, *opt_n_str, *opt_o_str;
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010074 char **lines;
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +020075 unsigned long long lo = lo;
Ron Yorston8817e282021-08-07 09:41:49 +010076 unsigned numlines, outlines;
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +020077 unsigned i;
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010078 char eol;
79
Denys Vlasenko22542ec2017-08-08 21:55:02 +020080 opts = getopt32(argv, "^"
81 OPT_STR
82 "\0" "e--i:i--e"/* mutually exclusive */,
83 &opt_i_str, &opt_n_str, &opt_o_str
84 );
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010085
86 argc -= optind;
87 argv += optind;
88
89 /* Prepare lines for shuffling - either: */
90 if (opts & OPT_e) {
91 /* make lines from command-line arguments */
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010092 numlines = argc;
93 lines = argv;
94 } else
95 if (opts & OPT_i) {
96 /* create a range of numbers */
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +020097 unsigned long long hi;
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010098 char *dash;
Denys Vlasenko2cdcb102014-03-05 18:56:20 +010099
Denys Vlasenko60f48432021-08-23 15:52:34 +0200100 if (argv[0])
101 bb_show_usage();
102
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100103 dash = strchr(opt_i_str, '-');
104 if (!dash) {
105 bb_error_msg_and_die("bad range '%s'", opt_i_str);
106 }
107 *dash = '\0';
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200108 lo = xatoull(opt_i_str);
109 hi = xatoull(dash + 1);
Maninder Singh7db312a2015-06-01 10:40:09 +0000110 *dash = '-';
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200111 if (hi < lo)
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100112 bb_error_msg_and_die("bad range '%s'", opt_i_str);
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200113 hi -= lo;
114 if (sizeof(size_t) > sizeof(numlines)) {
115 if (hi >= UINT_MAX)
116 bb_error_msg_and_die("bad range '%s'", opt_i_str);
117 } else {
118 if (hi >= UINT_MAX / sizeof(lines[0]))
119 bb_error_msg_and_die("bad range '%s'", opt_i_str);
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100120 }
121
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200122 numlines = hi + 1;
123 lines = xmalloc((size_t)numlines * sizeof(lines[0]));
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100124 for (i = 0; i < numlines; i++) {
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200125 lines[i] = (char*)(uintptr_t)i;
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100126 }
127 } else {
128 /* default - read lines from stdin or the input file */
129 FILE *fp;
Denys Vlasenko49a2e482021-08-29 14:39:01 +0200130 const char *fname = "-";
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100131
Denys Vlasenko49a2e482021-08-29 14:39:01 +0200132 if (argv[0]) {
133 if (argv[1])
134 bb_show_usage();
135 fname = argv[0];
136 }
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100137
Denys Vlasenko49a2e482021-08-29 14:39:01 +0200138 fp = xfopen_stdin(fname);
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100139 lines = NULL;
140 numlines = 0;
141 for (;;) {
142 char *line = xmalloc_fgetline(fp);
143 if (!line)
144 break;
145 lines = xrealloc_vector(lines, 6, numlines);
146 lines[numlines++] = line;
147 }
148 fclose_if_not_stdin(fp);
149 }
150
Ron Yorston8817e282021-08-07 09:41:49 +0100151 outlines = numlines;
152 if (opts & OPT_n) {
153 outlines = xatou(opt_n_str);
154 if (outlines > numlines)
155 outlines = numlines;
156 }
157
158 shuffle_lines(lines, numlines, outlines);
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100159
160 if (opts & OPT_o)
161 xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO);
162
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100163 eol = '\n';
164 if (opts & OPT_z)
165 eol = '\0';
166
Ron Yorston8817e282021-08-07 09:41:49 +0100167 for (i = numlines - outlines; i < numlines; i++) {
Denys Vlasenkof4ba69d2021-09-04 17:00:22 +0200168 if (opts & OPT_i)
169 printf("%llu%c", lo + (uintptr_t)lines[i], eol);
170 else
Denys Vlasenko102f0d02014-03-07 14:32:39 +0100171 printf("%s%c", lines[i], eol);
Denys Vlasenko2cdcb102014-03-05 18:56:20 +0100172 }
173
174 fflush_stdout_and_exit(EXIT_SUCCESS);
175}