summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Fankhauser (@tux-six)2022-05-30 16:13:08 +0200
committerAndreas Fankhauser (@tux-six)2022-05-30 16:13:08 +0200
commit3f8ac6551aa0f53fd14b216e97e79939635b8b82 (patch)
treed71d6b71cd5781692ca95fe938de630bd286e647
parent4d54d900cd2cfb880b5cada3b9baef6a30a04332 (diff)
downloadbulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.zip
bulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.tar.gz
Add from existing sources after some cleanup.
-rw-r--r--.gitignore1
-rw-r--r--Makefile81
-rw-r--r--README.md8
-rw-r--r--src/bulk_ln/bulk_ln.c422
-rw-r--r--src/bulk_ln/bulk_ln.h10
-rw-r--r--src/bulk_ln/bulk_ln_main.c10
-rw-r--r--src/common/commonbase.h23
7 files changed, 555 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..796b96d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f3abfb7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,81 @@
+
+# Toolchain and packaging tools.
+# BINEXT gets appended to the name of the produced executable and TOOLS
+# becomes part of the distribution archive name. LD and AR are not
+# referenced by any rule in this Makefile.
+CC=gcc
+LD=ld
+AR=ar
+TAR=tar
+BINEXT=
+TOOLS=lxGcc64
+
+ifndef PROJECT_VERSION
+ # We just provide a primitive version string so we can see which build
+ # we're using while debugging. For a release we will override the version
+ # by providing it from cli args to 'make' as in:
+ # make clean package PROJECT_VERSION=1.2.3
+ PROJECT_VERSION=$(shell date +0.0.0-%Y%m%d.%H%M%S)
+endif
+
+# Compiler flags: C99 with warnings treated as errors, except for the
+# 'unused-*' family which is either demoted to a warning or disabled.
+# PROJECT_VERSION is handed to the sources as a preprocessor macro.
+CFLAGS= --std=c99 \
+ -Wall -Wextra -Werror -fmax-errors=3 \
+ -Wno-error=unused-function -Wno-error=unused-label \
+ -Wno-error=unused-variable -Wno-error=unused-parameter \
+ -Wno-error=unused-const-variable \
+ -Werror=implicit-fallthrough=1 \
+ -Wno-error=unused-but-set-variable \
+ -Wno-unused-function -Wno-unused-parameter \
+ -DPROJECT_VERSION=$(PROJECT_VERSION)
+
+LDFLAGS= -Wl,--no-demangle,--fatal-warnings
+
+INCDIRS= -Isrc/bulk_ln -Isrc/common
+
+# Without NDEBUG we build with debug info and without optimization.
+# With NDEBUG set (eg 'make NDEBUG=1') we optimize for size and let the
+# linker drop unused sections.
+ifndef NDEBUG
+ CFLAGS := $(CFLAGS) -ggdb -O0 -g3
+else
+ CFLAGS := $(CFLAGS) -ffunction-sections -fdata-sections -Os "-DNDEBUG=1"
+ LDFLAGS := $(LDFLAGS) -Wl,--gc-sections,--as-needed
+endif
+
+
+# Default goal: build the executable, then create the distribution archive.
+default: link package
+
+# Removes all build output.
+.PHONY: clean
+clean:
+ @echo "\n[INFO ] Clean"
+ rm -rf build dist
+
+.PHONY: link
+link: build/bin/bulk-ln$(BINEXT)
+
+# Compiles 'src/<path>.c' to 'build/obj/<path>.o'. The output directory gets
+# created on demand.
+build/obj/%.o: src/%.c
+ @echo "\n[INFO ] Compile '$@'"
+ @mkdir -p $(shell dirname build/obj/$*)
+ $(CC) -c -o $@ $< $(CFLAGS) $(INCDIRS)
+
+# Links the executable from its object files.
+# NOTE: LIBSDIR is not defined anywhere in this Makefile. So it expands to
+# nothing unless provided from the environment or the 'make' command line.
+build/bin/bulk-ln$(BINEXT): \
+ build/obj/bulk_ln/bulk_ln.o \
+ build/obj/bulk_ln/bulk_ln_main.o
+ @echo "\n[INFO ] Link '$@'"
+ @mkdir -p $(shell dirname $@)
+ $(CC) -o $@ $(LDFLAGS) $^ $(LIBSDIR)
+
+# Creates 'dist/BulkLn-<version>-<tools>.tgz' containing the README, the
+# binaries and a MD5SUM file listing the checksums of the bundled files.
+# Refuses to package as long as 'git status --porcelain' reports any
+# changes in the worktree.
+.PHONY: package
+package: link
+ @echo "\n[INFO ] Package"
+ @rm -rf build/dist-* dist
+ @mkdir dist
+ @echo
+ @bash -c 'if [[ -n `git status --porcelain` ]]; then echo "[ERROR] Worktree not clean as it should be (see: git status)"; exit 1; fi'
+ @# Create Executable bundle.
+ @rm -rf build/dist-bin && mkdir -p build/dist-bin
+ @cp -t build/dist-bin \
+ README*
+ @mkdir build/dist-bin/bin
+ @cp -t build/dist-bin/bin \
+ build/bin/*$(BINEXT)
+ @(cd build/dist-bin && find . -type f -not -name MD5SUM -exec md5sum -b {} \;) > build/MD5SUM
+ @mv build/MD5SUM build/dist-bin/.
+ @(cd build/dist-bin && $(TAR) --owner=0 --group=0 -czf ../../dist/BulkLn-$(PROJECT_VERSION)-$(TOOLS).tgz *)
+ @echo "\n[INFO ] DONE: Artifacts created and placed in 'dist'."
+ @echo
+ @echo See './dist/' for result.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..57904bd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+
+BulkLn
+======
+
+An 'ln'-like tool for creating large numbers of links.
+
+Usage: bulk-ln --help
+
diff --git a/src/bulk_ln/bulk_ln.c b/src/bulk_ln/bulk_ln.c
new file mode 100644
index 0000000..7c7e98a
--- /dev/null
+++ b/src/bulk_ln/bulk_ln.c
@@ -0,0 +1,422 @@
+
+/* Header for this file */
+#include "bulk_ln.h"
+
+/* System */
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h> // TODO remove
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* Other packages from this project */
+/*#include ""*/
+
+
+/* Character which separates the source path from the destination path
+ * within one line of the input data. */
+#define DATA_FILE_FIELD_SEP_CHR '\t'
+
+
+typedef struct BulkLn BulkLn;
+
+
+/* Root context. Holds the options parsed from the command line as well as
+ * the runtime state (input stream, counters) of one run. */
+struct BulkLn {
+
+ /** File (path) where we read the paths-to-link from. The special
+ * value - (dash) means to use the stdin stream. */
+ char *dataFilePath;
+
+ /** Stream we read the path pairs from. Either stdin or a stream opened
+ * from 'dataFilePath'. */
+ FILE *dataFd;
+
+ /** If a dry-run got requested */
+ bool dryRun;
+
+ /** If true, we will print to stdout for every link we create (like we do
+ * in dry run) */
+ bool isPrintEachCreateLink;
+
+ /** print every call we make to mkdir to stdout. Usually only useful for
+ * debugging */
+ bool isPrintEachMkdir;
+
+ /** if true, we periodically print some kind of progress to stderr */
+ bool isPrintStatus;
+
+ /** if true, we print a summary what we did at the end of the run to stderr */
+ bool isPrintSummary;
+
+ /** if true, we will override existing files. This will be on for example
+ * if --force got specified. */
+ bool isRelinkExistingFiles;
+
+ /** Count of links we created. Also gets incremented during a dry-run. */
+ int createdLinksCount;
+
+ /** Count of directories we created. Already existing directories are
+ * not counted. Stays zero during a dry-run. */
+ int createdDirsCount;
+};
+
+
+
+/**
+ * Prints the usage text to stdout.
+ *
+ * The headline consists of the basename of this source file followed by the
+ * version which got passed in through the PROJECT_VERSION macro.
+ */
+void printHelp(void){
+    /* __FILE__ is whatever path got passed to the compiler. It may contain
+     * no slash at all, in which case 'strrchr' returns NULL and we have to
+     * fall back to the whole string instead of computing 'NULL + 1'. */
+    const char *fileName = strrchr(__FILE__, '/');
+    fileName = (fileName != NULL) ? fileName + 1 : __FILE__;
+
+    printf("\n %s%s\n", fileName, " @ " STR_QUOT(PROJECT_VERSION) "\n"
+        "\n"
+        "Utility to create links. Writing a custom implementation of 'ln'\n"
+        "got necessary as we found no way to instruct 'ln' to create a few thousand\n"
+        "links in an acceptable amount of time. So we just wrote our own ;)\n"
+        "\n"
+        "Takes paths (pairwise) from stdin (see --stdin for details) and creates a\n"
+        "hardlink for each pair from the 1st path to the 2nd.\n"
+        "\n"
+        "Options:\n"
+        "\n"
+        " --stdin\n"
+        " Read the path pairs to link from stdin. The format is like:\n"
+        "\n"
+        " <src-path> <tab> <dst-path> <newline>\n"
+        "\n"
+        " Example:\n"
+        "\n"
+        " origin/foo.txt\tnew/gugg.txt\n"
+        " origin/bar.txt\tnew/da.txt\n"
+        "\n"
+        " HINT: Preferred <newline> is LF. But CRLF should work too.\n"
+        "\n"
+        " --quiet\n"
+        " Don't print status or similar stuff. Errors will still be printed to\n"
+        " stderr.\n"
+        "\n"
+        " --verbose\n"
+        " Print stupid amount of logs. Usually only helpful for debugging. Should\n"
+        " NOT be combined with --quiet as this would be nonsense anyway.\n"
+        "\n"
+        " --dry-run\n"
+        " Will print the actions to stdout instead executing them.\n"
+        " HINT: The directory count in the summary will be implicitly set to zero,\n"
+        " as our used counting strategy would deliver wrong results when we not\n"
+        " actually creating the dirs.\n"
+        "\n"
+        " --force\n"
+        " Same meaning as in original 'ln' command.\n"
+        "\n");
+}
+
+
+/**
+ * Initializes the option fields of 'bulkLn' with their defaults and then
+ * applies the options found in 'argv'.
+ *
+ * @return
+ *      Zero if the run can continue. Non-zero otherwise (help got requested,
+ *      unknown arg, no input method specified). The corresponding output
+ *      got already printed when this function returns.
+ */
+int parseArgs( int argc, char**argv, BulkLn*bulkLn ){
+    /* Defaults */
+    bulkLn->dataFilePath = NULL;
+    bulkLn->dryRun = false;
+    bulkLn->isPrintEachCreateLink = false;
+    bulkLn->isPrintEachMkdir = false;
+    bulkLn->isPrintStatus = true;
+    bulkLn->isPrintSummary = true;
+    bulkLn->isRelinkExistingFiles = false;
+
+    /* Walk through the args (skipping the program name) */
+    for( int idx = 1 ; idx < argc ; ++idx ){
+        char *opt = argv[idx];
+
+        if( strcmp(opt, "--help") == 0 ){
+            printHelp();
+            return -1;
+        }
+        if( strcmp(opt, "--dry-run") == 0 ){
+            bulkLn->dryRun = true;
+            continue;
+        }
+        if( strcmp(opt, "--force") == 0 ){
+            bulkLn->isRelinkExistingFiles = true;
+            continue;
+        }
+        if( strcmp(opt, "--quiet") == 0 ){
+            bulkLn->isPrintStatus = false;
+            bulkLn->isPrintSummary = false;
+            continue;
+        }
+        if( strcmp(opt, "--stdin") == 0 ){
+            bulkLn->dataFilePath = "-";
+            continue;
+        }
+        if( strcmp(opt, "--verbose") == 0 ){
+            bulkLn->isPrintStatus = true;
+            bulkLn->isPrintSummary = true;
+            bulkLn->isPrintEachCreateLink = true;
+            bulkLn->isPrintEachMkdir = true;
+            continue;
+        }
+
+        /* None of the known options did match */
+        fprintf(stderr, "Unknown arg '%s'.\n", opt);
+        return -1;
+    }
+
+    /* An input method MUST be specified, even though there is only one.
+     * Requiring an arg is the simplest way to prevent damage in case someone
+     * (eg accidentally) invokes the utility without args. It also makes the
+     * utility easier to extend without breaking everything. */
+    if( bulkLn->dataFilePath == NULL ){
+        fprintf(stderr, "Arg '--stdin' missing. Try --help\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+
+/**
+ * Like: mkdir -p path
+ *
+ * Creates 'path' including all its missing parent directories. Directories
+ * which already exist are fine. In a dry-run the calls are only printed.
+ *
+ * WARN: Passed 'path' gets temporarily changed during execution.
+ * Nevertheless it will be in its original state after return, no matter if
+ * the call succeeded or failed.
+ *
+ * @return
+ *      Zero on success. Non-zero if a directory could not be created, in
+ *      which case the reason got already printed to stderr.
+ */
+static int mkdirs( char*path, BulkLn*bulkLn ){
+    int err;
+    /* Backup original end so we later still can recognize it, as we are
+     * going to place zeroes in the path during the run. */
+    char *pathEnd = path + strlen(path);
+    char *tmpEnd = path;
+
+    for(;;){
+        /* Loop for each slash in the path beginning from the topmost dir
+         * toward the innermost dir. This way our path gets longer by one
+         * segment each time we call 'mkdir' below, which makes sure every
+         * parent dir exists before we create the next one. */
+
+        /* Starting the search one char after the previous position makes
+         * sure we never cut a path down to the empty string due to a leading
+         * slash. But we MUST NOT search beyond the terminator, which would
+         * happen for an empty 'path'. */
+        tmpEnd = (tmpEnd < pathEnd) ? strchr(tmpEnd + 1, '/') : NULL;
+        if( tmpEnd == NULL ){
+            /* The last (innermost) segment to create a dir for */
+            tmpEnd = pathEnd;
+        }
+
+        /* Temporarily zero-terminate the path so we can create the parent
+         * dir up to there */
+        tmpEnd[0] = '\0';
+
+        /* Print if requested */
+        if( bulkLn->dryRun || bulkLn->isPrintEachMkdir ){
+            printf("mkdir(%s)\n", path);
+        }
+
+        if( ! bulkLn->dryRun ){
+            /* Perform the real action. Mode gets masked by umask and after
+             * that we have the defaults (aka what we want) */
+            if( mkdir(path, 0777) == 0 ){
+                /* Only count directories we really created. Eg those which
+                 * did NOT already exist */
+                bulkLn->createdDirsCount += 1;
+            }else if( errno != EEXIST ){
+                fprintf(stderr, "mkdir(%s): %s\n", path, strerror(errno));
+                err = -1; goto finally;
+            }
+            /* else: Already exists. Fine :) Just continue with the next. */
+        }
+
+        if( tmpEnd == pathEnd ){
+            /* We did process the whole path. So we're done. */
+            break;
+        }
+        /* Restore where we did cut-off the path, then loop to add and
+         * process one more segment */
+        tmpEnd[0] = '/';
+    }
+
+    err = 0;
+finally:
+    /* In case we bailed out in the middle of the path, we have to undo our
+     * cut so the caller gets back its path unchanged. */
+    if( tmpEnd != NULL && tmpEnd != pathEnd ){
+        tmpEnd[0] = '/';
+    }
+    return err;
+}
+
+
+/**
+ * Like: ln srcPath dstPath (or 'ln -f' if relinking of existing files got
+ * requested).
+ *
+ * In a dry-run the action is only printed. The link counter of 'bulkLn'
+ * gets incremented in either case.
+ *
+ * If relinking is enabled and 'dstPath' already refers to the very same file
+ * as 'srcPath', nothing is changed and the pair is treated as successfully
+ * linked. Without that check, we would delete 'dstPath' beforehand and so
+ * possibly destroy the only name of the file we're about to link.
+ *
+ * @return
+ *      Zero on success. Non-zero on failure, in which case the reason got
+ *      already printed to stderr.
+ */
+static int createHardlink( char*srcPath, char*dstPath, BulkLn*bulkLn ){
+    int err;
+
+    if( bulkLn->dryRun || bulkLn->isPrintEachCreateLink ){
+        printf("link('%s', '%s')\n", srcPath, dstPath);
+    }
+
+    if( ! bulkLn->dryRun ){
+        /* Perform the real action */
+        bool isSameFile = false;
+
+        if( bulkLn->isRelinkExistingFiles ){
+            struct stat srcStat;
+            struct stat dstStat;
+            /* If one of the two cannot be inspected (eg dst does not exist
+             * yet) we just go on and let unlink/link report any problem. */
+            isSameFile = lstat(srcPath, &srcStat) == 0
+                && lstat(dstPath, &dstStat) == 0
+                && srcStat.st_dev == dstStat.st_dev
+                && srcStat.st_ino == dstStat.st_ino;
+
+            if( ! isSameFile ){
+                /* Delete beforehand so we can be sure 'link' does not fail
+                 * due already existing file */
+                err = unlink(dstPath);
+                if( err && errno != ENOENT ){
+                    /* ENOENT would be fine, as there is just nothing to
+                     * delete. Anything else is unexpected. */
+                    fprintf(stderr, "unlink(%s): %s\n", dstPath, strerror(errno));
+                    err = -1; goto finally;
+                }
+            }
+        }
+
+        if( ! isSameFile ){
+            err = link(srcPath, dstPath);
+            if( err ){
+                fprintf(stderr, "link('%s', '%s'): %s\n", srcPath, dstPath, strerror(errno));
+                err = -1; goto finally;
+            }
+        }
+    }
+
+    bulkLn->createdLinksCount += 1;
+
+    err = 0;
+finally:
+    return err;
+}
+
+
+/**
+ * Processes one pair of paths: Makes sure the parent directory of 'dstPath'
+ * exists, then links 'srcPath' to 'dstPath'.
+ *
+ * 'dstPath' gets temporarily modified but is back in its original state
+ * when this function returns.
+ *
+ * @return
+ *      Zero on success, non-zero on failure.
+ */
+static int onPathPair( char*srcPath, char*dstPath, BulkLn*bulkLn ){
+    assert(srcPath != NULL);
+    assert(dstPath != NULL);
+    assert(bulkLn != NULL);
+    int err;
+
+    if( bulkLn->isPrintStatus && bulkLn->createdLinksCount % 10000 == 0 ){
+        fprintf(stderr, "Created %7d links so far.\n", bulkLn->createdLinksCount);
+    }
+
+    /* Search end of parent dir path */
+    char *tmpEnd = strrchr(dstPath, '/');
+    /* No slash at all means there is no parent dir to create. If the only
+     * slash is the leading one, the parent is the root dir. Cutting there
+     * would leave us with an empty path, and there is nothing to create
+     * anyway. */
+    if( tmpEnd != NULL && tmpEnd != dstPath ){
+        /* Temporarily cut-off the last segment (filename) to create the
+         * parent-dirs */
+        tmpEnd[0] = '\0';
+        err = mkdirs(dstPath, bulkLn);
+        /* Restore path. Also in case of an error. */
+        tmpEnd[0] = '/';
+        if( err ){ err = -1; goto finally; }
+    }
+
+    err = createHardlink(srcPath, dstPath, bulkLn);
+    if( err ){ err = -1; goto finally; }
+
+    err = 0;
+finally:
+    return err;
+}
+
+
+/**
+ * Reads the input stream of 'bulkLn' line-by-line, splits every line into
+ * its source and destination path and passes each pair to 'onPathPair'.
+ *
+ * Every line MUST contain exactly one field separator. Trailing CR and LF
+ * are stripped from the destination path.
+ *
+ * @return
+ *      Zero on success. Non-zero as soon as reading, parsing or processing
+ *      of a pair fails. The reason got already printed to stderr then.
+ */
+static int parseDataFileAsPairPerLine( BulkLn*bulkLn ){
+    int err;
+    size_t buf_cap = 0;
+    char *buf = NULL;
+    size_t lineNum = 0;
+    size_t recordCount = 0;
+
+    for(;;){
+        lineNum += 1;
+
+        /* Read input line-by-line. Not the most elegant way to parse stuff,
+         * but should suffice for our use-case. 'getline' returns a ssize_t,
+         * so we must not squeeze that into an int. */
+        ssize_t readLen = getline(&buf, &buf_cap, bulkLn->dataFd);
+        if(unlikely( readLen < 0 )){
+            if( feof(bulkLn->dataFd) ){
+                break; /* End-Of-File. Just break off the loop */
+            }
+            /* Either a read error or 'getline' failed to allocate memory */
+            fprintf(stderr, "getline(%s): %s\n", bulkLn->dataFilePath, strerror(errno));
+            err = -1; goto finally;
+        }
+        size_t buf_len = (size_t)readLen;
+        char *lineEnd = buf + buf_len;
+
+        /* Extract the two paths from our line */
+        char *srcPath = buf;
+        char *tab = memchr(buf, DATA_FILE_FIELD_SEP_CHR, buf_len);
+        if( tab == NULL ){
+            fprintf(stderr, "Too few field separators (tab) in '%s' @ %zu\n",
+                bulkLn->dataFilePath, lineNum);
+            err = -1; goto finally;
+        }
+        char *dstPath = tab + 1; /* <- path starts one char after the separator */
+        /* Only search the remainder of the line, which is everything between
+         * 'dstPath' and the end of what we did read. */
+        if( memchr(dstPath, DATA_FILE_FIELD_SEP_CHR, (size_t)(lineEnd - dstPath)) != NULL ){
+            fprintf(stderr, "Too many field separators (tab) in '%s' @ %zu\n",
+                bulkLn->dataFilePath, lineNum);
+            err = -1; goto finally;
+        }
+
+        /* Strip the line terminator (LF, CRLF) plus any trailing zero bytes
+         * off the end of the destination path. */
+        char *dstPath_end = lineEnd;
+        while( dstPath_end > dstPath
+            && (dstPath_end[-1]=='\n' || dstPath_end[-1]=='\r' || dstPath_end[-1]=='\0')
+        ){
+            dstPath_end -= 1;
+        }
+
+        /* Zero-Terminate the two strings */
+        tab[0] = '\0';
+        dstPath_end[0] = '\0';
+
+        /* Publish this pair for processing */
+        err = onPathPair(srcPath, dstPath, bulkLn);
+        if( err ){ err = -1; goto finally; }
+        recordCount += 1;
+    }
+
+    if( bulkLn->isPrintStatus ){
+        fprintf(stderr, "Parsed %zu records from '%s'\n", recordCount, bulkLn->dataFilePath);
+    }
+
+    err = 0;
+finally:
+    free(buf);
+    return err;
+}
+
+
+/**
+ * Runs the whole utility: Parses the args, opens the data source, processes
+ * all path pairs and finally prints the summary (if enabled).
+ *
+ * @return
+ *      Zero on success, non-zero on failure.
+ */
+int bulk_ln_main( int argc, char**argv ){
+    int err;
+    BulkLn ctx = {0};
+
+    /* parse args */
+    err = parseArgs(argc, argv, &ctx);
+    if( err ){ err = -1; goto finally; }
+
+    /* Open data source */
+    if( !strcmp(ctx.dataFilePath, "-") ){
+        ctx.dataFd = stdin;
+    }else{
+        ctx.dataFd = fopen(ctx.dataFilePath, "rb");
+        if( ctx.dataFd == NULL ){
+            fprintf(stderr, "fopen(%s): %s\n", ctx.dataFilePath, strerror(errno));
+            err = -1; goto finally;
+        }
+    }
+
+    err = parseDataFileAsPairPerLine(&ctx);
+    if( err ){ err = -1; goto finally; }
+
+    if( ctx.isPrintSummary ){
+        fprintf(stderr, "Created %d directories and linked %d files.\n",
+            ctx.createdDirsCount, ctx.createdLinksCount);
+    }
+
+    err = 0;
+finally:
+    /* Only close what we did open ourself. So never close stdin. */
+    if( ctx.dataFd != NULL && ctx.dataFd != stdin ){
+        fclose(ctx.dataFd); ctx.dataFd = NULL;
+    }
+    return err;
+}
+
diff --git a/src/bulk_ln/bulk_ln.h b/src/bulk_ln/bulk_ln.h
new file mode 100644
index 0000000..6054386
--- /dev/null
+++ b/src/bulk_ln/bulk_ln.h
@@ -0,0 +1,10 @@
+#ifndef INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c
+#define INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c
+
+#include "commonbase.h"
+
+
+/**
+ * Entry point of the bulk-ln utility.
+ *
+ * @param argc
+ *      Number of entries in 'argv'.
+ * @param argv
+ *      Command line arguments as received by 'main'.
+ * @return
+ *      Zero on success, non-zero on failure.
+ */
+int bulk_ln_main( int argc, char**argv );
+
+
+#endif /* INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c */
diff --git a/src/bulk_ln/bulk_ln_main.c b/src/bulk_ln/bulk_ln_main.c
new file mode 100644
index 0000000..a3ac228
--- /dev/null
+++ b/src/bulk_ln/bulk_ln_main.c
@@ -0,0 +1,10 @@
+
+#include "bulk_ln.h"
+
+/**
+ * Process entry point. Delegates to 'bulk_ln_main' and maps its result to
+ * an exit code.
+ */
+int main( int argc, char**argv ){
+    int exitCode = bulk_ln_main(argc, argv);
+
+    /* Ensure to return 7-bit value only (See POSIX) */
+    if( exitCode < 0 ){
+        exitCode = -exitCode;
+    }
+    if( exitCode > 127 ){
+        return 1;
+    }
+    return exitCode;
+}
+
diff --git a/src/common/commonbase.h b/src/common/commonbase.h
new file mode 100644
index 0000000..497d3fa
--- /dev/null
+++ b/src/common/commonbase.h
@@ -0,0 +1,23 @@
+
+/*
+ * common config for project. Here goes stuff like feature-test-macros etc.
+ *
+ * Every header file MUST include this file AS THE 1ST include.
+ */
+
+
+/* Feature-test macro. Has to be defined before any system header gets
+ * included, which is why this file has to be the 1st include. */
+#define _POSIX_C_SOURCE 200809L
+
+
+/* STR_QUOT(s) yields the macro-expanded 's' as a string literal. The
+ * indirection through the helper macro is what makes 's' get expanded
+ * before the '#' operator is applied. */
+#define STR_QUOT_IAHGEWIH(s) #s
+#define STR_QUOT(s) STR_QUOT_IAHGEWIH(s)
+
+/* Pastes the two tokens 'a' and 'b' together.
+ * NOTE: There is no indirection here. So the args are NOT macro-expanded
+ * before they get pasted. */
+#define STR_CAT(a, b) a ## b
+
+/* Presumably meant as branch prediction hints. Unless already defined
+ * elsewhere, they just evaluate to their argument here. */
+#ifndef likely
+# define likely(a) (a)
+#endif
+#ifndef unlikely
+# define unlikely(a) (a)
+#endif
+