blob: 79030c92f32756c8c4e6db778ebfac88e56114e8 [file] [log] [blame]
Yann Colletd0b7da32024-01-29 15:00:32 -08001/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
Yann Colletd0b7da32024-01-29 15:00:32 -080011/* Implementation notes:
12 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another, in random order,
 * with a fake sentence / paragraph structure.
17 *
18 * The goal is to generate a printable text
19 * that can be used to fake a text compression scenario.
20 * The resulting compression / ratio curve of the lorem ipsum generator
21 * is more satisfying than the previous statistical generator,
22 * which was initially designed for entropy compression,
23 * and lacks a regularity more representative of text.
24 *
25 * The compression ratio achievable on the generated lorem ipsum
Yann Collet83598aa2024-02-20 15:24:25 -080026 * is still a bit too good, presumably because the dictionary is a bit too
27 * small. It would be possible to create some more complex scheme, notably by
28 * enlarging the dictionary with a word generator, and adding grammatical rules
29 * (composition) and syntax rules. But that's probably overkill for the intended
30 * goal.
Yann Colletd0b7da32024-01-29 15:00:32 -080031 */
32
33#include "lorem.h"
Yann Colletd0b7da32024-01-29 15:00:32 -080034#include <assert.h>
Yann Collet1e046ce2024-02-20 00:12:32 -080035#include <limits.h> /* INT_MAX */
36#include <string.h> /* memcpy */
Yann Colletd0b7da32024-01-29 15:00:32 -080037
/* Upper bound on the length of a single word.
 * NOTE(review): not referenced by the code visible in this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Order matters: generateFirstSentence() emits entries 0..18 verbatim as the
 * classic "Lorem ipsum dolor sit amet, ..." opening sentence, so new words
 * must only be appended after that prefix.
 * The leading comment on each row is the index of the row's first word. */
static const char* kWords[] = {
    /*   0 */ "lorem",        "ipsum",       "dolor",        "sit",          "amet",
    /*   5 */ "consectetur",  "adipiscing",  "elit",         "sed",          "do",
    /*  10 */ "eiusmod",      "tempor",      "incididunt",   "ut",           "labore",
    /*  15 */ "et",           "dolore",      "magna",        "aliqua",       "dis",
    /*  20 */ "lectus",       "vestibulum",  "mattis",       "ullamcorper",  "velit",
    /*  25 */ "commodo",      "a",           "lacus",        "arcu",         "magnis",
    /*  30 */ "parturient",   "montes",      "nascetur",     "ridiculus",    "mus",
    /*  35 */ "mauris",       "nulla",       "malesuada",    "pellentesque", "eget",
    /*  40 */ "gravida",      "in",          "dictum",       "non",          "erat",
    /*  45 */ "nam",          "voluptat",    "maecenas",     "blandit",      "aliquam",
    /*  50 */ "etiam",        "enim",        "lobortis",     "scelerisque",  "fermentum",
    /*  55 */ "dui",          "faucibus",    "ornare",       "at",           "elementum",
    /*  60 */ "eu",           "facilisis",   "odio",         "morbi",        "quis",
    /*  65 */ "eros",         "donec",       "ac",           "orci",         "purus",
    /*  70 */ "turpis",       "cursus",      "leo",          "vel",          "porta",
    /*  75 */ "consequat",    "interdum",    "varius",       "vulputate",    "aliquet",
    /*  80 */ "pharetra",     "nunc",        "auctor",       "urna",         "id",
    /*  85 */ "metus",        "viverra",     "nibh",         "cras",         "mi",
    /*  90 */ "unde",         "omnis",       "iste",         "natus",        "error",
    /*  95 */ "perspiciatis", "voluptatem",  "accusantium",  "doloremque",   "laudantium",
    /* 100 */ "totam",        "rem",         "aperiam",      "eaque",        "ipsa",
    /* 105 */ "quae",         "ab",          "illo",         "inventore",    "veritatis",
    /* 110 */ "quasi",        "architecto",  "beatae",       "vitae",        "dicta",
    /* 115 */ "sunt",         "explicabo",   "nemo",         "ipsam",        "quia",
    /* 120 */ "voluptas",     "aspernatur",  "aut",          "odit",         "fugit",
    /* 125 */ "consequuntur", "magni",       "dolores",      "eos",          "qui",
    /* 130 */ "ratione",      "sequi",       "nesciunt",     "neque",        "porro",
    /* 135 */ "quisquam",     "est",         "dolorem",      "adipisci",     "numquam",
    /* 140 */ "eius",         "modi",        "tempora",      "incidunt",     "magnam",
    /* 145 */ "quaerat",      "ad",          "minima",       "veniam",       "nostrum",
    /* 150 */ "ullam",        "corporis",    "suscipit",     "laboriosam",   "nisi",
    /* 155 */ "aliquid",      "ex",          "ea",           "commodi",      "consequatur",
    /* 160 */ "autem",        "eum",         "iure",         "voluptate",    "esse",
    /* 165 */ "quam",         "nihil",       "molestiae",    "illum",        "fugiat",
    /* 170 */ "quo",          "pariatur",    "vero",         "accusamus",    "iusto",
    /* 175 */ "dignissimos",  "ducimus",     "blanditiis",   "praesentium",  "voluptatum",
    /* 180 */ "deleniti",     "atque",       "corrupti",     "quos",         "quas",
    /* 185 */ "molestias",    "excepturi",   "sint",         "occaecati",    "cupiditate",
    /* 190 */ "provident",    "similique",   "culpa",        "officia",      "deserunt",
    /* 195 */ "mollitia",     "animi",       "laborum",      "dolorum",      "fuga",
    /* 200 */ "harum",        "quidem",      "rerum",        "facilis",      "expedita",
    /* 205 */ "distinctio",   "libero",      "tempore",      "cum",          "soluta",
    /* 210 */ "nobis",        "eligendi",    "optio",        "cumque",       "impedit",
    /* 215 */ "minus",        "quod",        "maxime",       "placeat",      "facere",
    /* 220 */ "possimus",     "assumenda",   "repellendus",  "temporibus",   "quibusdam",
    /* 225 */ "officiis",     "debitis",     "saepe",        "eveniet",      "voluptates",
    /* 230 */ "repudiandae",  "recusandae",  "itaque",       "earum",        "hic",
    /* 235 */ "tenetur",      "sapiente",    "delectus",     "reiciendis",   "cillum",
    /* 240 */ "maiores",      "alias",       "perferendis",  "doloribus",    "asperiores",
    /* 245 */ "repellat",     "minim",       "nostrud",      "exercitation", "ullamco",
    /* 250 */ "laboris",      "aliquip",     "duis",         "aute",         "irure",
};
/* Number of entries in kWords, derived from the array itself. */
static const unsigned kNbWords = (unsigned)(sizeof kWords / sizeof kWords[0]);
Yann Colletd0b7da32024-01-29 15:00:32 -080095
/* Simple 1-dimension distribution, based on word length; favors short words.
 * kWeights[len] is the number of slots that a word of `len` characters gets
 * in the distribution table. Words whose length is >= kNbWeights - 1 all use
 * the last entry. Length 0 gets no slot at all. */
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);

/* Capacity of the distribution table. It must be at least the sum of the
 * weights of all words in the pool (checked by assert in countFreqs()). */
#define DISTRIB_SIZE_MAX 650
/* g_distrib[0 .. g_distribCount-1] holds word indices; a word with weight N
 * appears N times, so a uniform pick in the table is a weighted pick of a
 * word. g_distribCount == 0 means "not initialized yet". */
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount       = 0;

/* Number of table slots attributed to `word`.
 * The word length is clamped to the last entry of `weights`.
 * An empty weight table or a negative weight yields 0 slots, instead of
 * indexing out of bounds or wrapping the unsigned total. */
static unsigned
wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t len;
    if (nbWeights == 0)
        return 0;
    len = strlen(word);
    if (len >= nbWeights)
        len = nbWeights - 1;
    return (weights[len] > 0) ? (unsigned)weights[len] : 0;
}

/* Computes the number of slots required by `words` under `weights`
 * and stores it into g_distribCount.
 * @words / @nbWords : word pool
 * @weights / @nbWeights : per-length weights, see kWeights
 * Debug builds abort if the total exceeds DISTRIB_SIZE_MAX.
 * Builds with NDEBUG clamp the total to the table capacity, so that
 * later reads of g_distrib always stay in bounds. */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned total = 0;
    size_t w;
    for (w = 0; w < nbWords; w++) {
        total += wordWeight(words[w], weights, nbWeights);
    }
    assert(total <= DISTRIB_SIZE_MAX);
    if (total > DISTRIB_SIZE_MAX)
        total = DISTRIB_SIZE_MAX;
    g_distribCount = total;
}

/* Fills g_distrib with word indices: word `w` is written as many consecutive
 * times as its weight, in pool order. Also sets g_distribCount.
 * @words / @nbWords : word pool
 * @weights / @nbWeights : per-length weights, see kWeights
 * Writes never go past g_distribCount (itself <= DISTRIB_SIZE_MAX). */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t w, d = 0;
    countFreqs(words, nbWords, weights, nbWeights);
    for (w = 0; w < nbWords && d < g_distribCount; w++) {
        unsigned const slots = wordWeight(words[w], weights, nbWeights);
        unsigned s;
        for (s = 0; s < slots && d < g_distribCount; s++) {
            g_distrib[d++] = (int)w;
        }
    }
}
Yann Colletd0b7da32024-01-29 15:00:32 -0800144
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed: the whole generator state lives in the
 * file-scope variables below. */
static char* g_ptr        = NULL;     /* output buffer of the generation in progress */
static size_t g_nbChars   = 0;        /* nb of characters already written into g_ptr */
static size_t g_maxChars  = 10000000; /* capacity of g_ptr; LOREM_genBlock() overwrites it */
static unsigned g_randRoot = 0;       /* state of the pseudo-random generator */

/* Rotate-left of a 32-bit unsigned value. Requires 0 < r < 32.
 * Both arguments are parenthesized, so that expressions such as
 * RDG_rotl32(a | b, n + 1) expand with the intended precedence. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* Advances the generator state (multiply, xor, rotate) and scales the new
 * 32-bit state down to `range`.
 * @return : a value in [0, range) ; always 0 when range == 0.
 * The sequence only depends on g_randRoot, hence on the seed. */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* 32x32 => 64-bit product, keep the high half: maps rand32 onto [0, range) */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
164
Yann Collet1e046ce2024-02-20 00:12:32 -0800165static void writeLastCharacters(void)
166{
167 size_t lastChars = g_maxChars - g_nbChars;
168 assert(g_maxChars >= g_nbChars);
169 if (lastChars == 0)
170 return;
171 g_ptr[g_nbChars++] = '.';
172 if (lastChars > 2) {
173 memset(g_ptr + g_nbChars, ' ', lastChars - 2);
174 }
175 if (lastChars > 1) {
176 g_ptr[g_maxChars - 1] = '\n';
177 }
178 g_nbChars = g_maxChars;
Yann Colletd0b7da32024-01-29 15:00:32 -0800179}
180
Yann Collet1e046ce2024-02-20 00:12:32 -0800181static void generateWord(const char* word, const char* separator, int upCase)
Yann Colletd0b7da32024-01-29 15:00:32 -0800182{
183 size_t const len = strlen(word) + strlen(separator);
184 if (g_nbChars + len > g_maxChars) {
185 writeLastCharacters();
186 return;
187 }
188 memcpy(g_ptr + g_nbChars, word, strlen(word));
189 if (upCase) {
190 static const char toUp = 'A' - 'a';
Yann Collet1e046ce2024-02-20 00:12:32 -0800191 g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
Yann Colletd0b7da32024-01-29 15:00:32 -0800192 }
193 g_nbChars += strlen(word);
194 memcpy(g_ptr + g_nbChars, separator, strlen(separator));
195 g_nbChars += strlen(separator);
196}
197
/* Random value "around" target: sum of two draws in [0, target), plus one.
 * @return : a value in [1, 2*target - 1] (for target >= 1),
 *           more likely to be close to `target` than to the extremes. */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
202
203/* Function to generate a random sentence */
Yann Collet1e046ce2024-02-20 00:12:32 -0800204static void generateSentence(int nbWords)
205{
Yann Collet5a1bb4a2024-02-20 00:37:21 -0800206 int commaPos = about(9);
207 int comma2 = commaPos + about(7);
208 int qmark = (LOREM_rand(11) == 7);
209 const char* endSep = qmark ? "? " : ". ";
Yann Collet1e046ce2024-02-20 00:12:32 -0800210 int i;
211 for (i = 0; i < nbWords; i++) {
Yann Collet3dbd8612024-02-20 12:26:37 -0800212 int const wordID = g_distrib[LOREM_rand(g_distribCount)];
213 const char* const word = kWords[wordID];
Yann Collet1e046ce2024-02-20 00:12:32 -0800214 const char* sep = " ";
215 if (i == commaPos)
216 sep = ", ";
217 if (i == comma2)
218 sep = ", ";
219 if (i == nbWords - 1)
Yann Collet5a1bb4a2024-02-20 00:37:21 -0800220 sep = endSep;
Yann Collet1e046ce2024-02-20 00:12:32 -0800221 generateWord(word, sep, i == 0);
222 }
Yann Colletd0b7da32024-01-29 15:00:32 -0800223}
224
Yann Collet1e046ce2024-02-20 00:12:32 -0800225static void generateParagraph(int nbSentences)
226{
227 int i;
228 for (i = 0; i < nbSentences; i++) {
Yann Collet7003c992024-02-20 13:27:36 -0800229 int wordsPerSentence = about(11);
Yann Collet1e046ce2024-02-20 00:12:32 -0800230 generateSentence(wordsPerSentence);
231 }
232 if (g_nbChars < g_maxChars) {
233 g_ptr[g_nbChars++] = '\n';
234 }
235 if (g_nbChars < g_maxChars) {
236 g_ptr[g_nbChars++] = '\n';
237 }
Yann Colletd0b7da32024-01-29 15:00:32 -0800238}
239
240/* It's "common" for lorem ipsum generators to start with the same first
241 * pre-defined sentence */
Yann Collet1e046ce2024-02-20 00:12:32 -0800242static void generateFirstSentence(void)
243{
244 int i;
245 for (i = 0; i < 18; i++) {
Yann Collet3dbd8612024-02-20 12:26:37 -0800246 const char* word = kWords[i];
Yann Collet1e046ce2024-02-20 00:12:32 -0800247 const char* separator = " ";
248 if (i == 4)
249 separator = ", ";
250 if (i == 7)
251 separator = ", ";
252 generateWord(word, separator, i == 0);
253 }
Yann Collet3dbd8612024-02-20 12:26:37 -0800254 generateWord(kWords[18], ". ", 0);
Yann Colletd0b7da32024-01-29 15:00:32 -0800255}
256
Yann Collet1e046ce2024-02-20 00:12:32 -0800257size_t
258LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
Yann Colletd0b7da32024-01-29 15:00:32 -0800259{
Yann Collet1e046ce2024-02-20 00:12:32 -0800260 g_ptr = (char*)buffer;
261 assert(size < INT_MAX);
262 g_maxChars = size;
263 g_nbChars = 0;
264 g_randRoot = seed;
Yann Collet3dbd8612024-02-20 12:26:37 -0800265 if (g_distribCount == 0) {
266 init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
267 }
268
Yann Collet1e046ce2024-02-20 00:12:32 -0800269 if (first) {
270 generateFirstSentence();
271 }
272 while (g_nbChars < g_maxChars) {
273 int sentencePerParagraph = about(7);
274 generateParagraph(sentencePerParagraph);
275 if (!fill)
276 break; /* only generate one paragraph in not-fill mode */
277 }
278 g_ptr = NULL;
279 return g_nbChars;
Yann Colletd0b7da32024-01-29 15:00:32 -0800280}
281
/* Fills the whole `buffer` (`size` bytes) with lorem ipsum text derived from
 * `seed`, starting with the classic pre-defined first sentence. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const startWithFirstSentence = 1;
    int const fillWholeBuffer        = 1;
    (void)LOREM_genBlock(
            buffer, size, seed, startWithFirstSentence, fillWholeBuffer);
}