Add from existing sources after some cleanup.

author: Andreas Fankhauser (@tux-six) 2022-05-30 16:13:08 +0200
committer: Andreas Fankhauser (@tux-six) 2022-05-30 16:13:08 +0200
commit: 3f8ac6551aa0f53fd14b216e97e79939635b8b82 (patch)
tree: d71d6b71cd5781692ca95fe938de630bd286e647
parent: 4d54d900cd2cfb880b5cada3b9baef6a30a04332 (diff)
download: bulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.zip
bulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.tar.gz
7 files changed, 555 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..796b96d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f3abfb7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,81 @@
+
+CC=gcc
+LD=ld
+AR=ar
+TAR=tar
+BINEXT=
+TOOLS=lxGcc64
+
+ifndef PROJECT_VERSION
+	# Development builds get a timestamp based pseudo version, so we can tell
+	# builds apart while debugging. ':=' makes 'date' run exactly once, so the
+	# compiled-in version and the archive name agree. For a release pass it in:
+	#     make clean package PROJECT_VERSION=1.2.3
+	PROJECT_VERSION := $(shell date +0.0.0-%Y%m%d.%H%M%S)
+endif
+
+CFLAGS= --std=c99                                                             \
+	-Wall -Wextra -Werror -fmax-errors=3                                      \
+	-Wno-error=unused-function -Wno-error=unused-label                        \
+	-Wno-error=unused-variable -Wno-error=unused-parameter                    \
+	-Wno-error=unused-const-variable                                          \
+	-Werror=implicit-fallthrough=1                                            \
+	-Wno-error=unused-but-set-variable                                        \
+	-Wno-unused-function -Wno-unused-parameter                                \
+	-DPROJECT_VERSION=$(PROJECT_VERSION)
+
+LDFLAGS= -Wl,--no-demangle,--fatal-warnings
+
+INCDIRS= -Isrc/bulk_ln -Isrc/common
+# Debug build by default. Define NDEBUG (eg 'make NDEBUG=1') to get a size-optimized build.
+ifndef NDEBUG
+	CFLAGS := $(CFLAGS) -ggdb -O0 -g3
+else
+	CFLAGS := $(CFLAGS) -ffunction-sections -fdata-sections -Os "-DNDEBUG=1"
+	LDFLAGS := $(LDFLAGS) -Wl,--gc-sections,--as-needed
+endif
+
+
+default: link package
+# NOTE: Messages use 'printf', as it depends on the shell whether 'echo' expands "\n".
+.PHONY: clean
+clean:
+	@printf '\n[INFO ] Clean\n'
+	rm -rf build dist
+# Short alias for the executable.
+.PHONY: link
+link: build/bin/bulk-ln$(BINEXT)
+# Compile a single source file. Dirs below 'build/obj' mirror those below 'src'.
+build/obj/%.o: src/%.c
+	@printf "\n[INFO ] Compile '%s'\n" "$@"
+	@mkdir -p $(@D)
+	$(CC) -c -o $@ $< $(CFLAGS) $(INCDIRS)
+# Link the executable from its objects.
+build/bin/bulk-ln$(BINEXT): \
+		build/obj/bulk_ln/bulk_ln.o \
+		build/obj/bulk_ln/bulk_ln_main.o
+	@printf "\n[INFO ] Link '%s'\n" "$@"
+	@mkdir -p $(@D)
+	$(CC) -o $@ $(LDFLAGS) $^ $(LIBSDIR)
+# Bundle into 'dist'. NOTE: 'dist' is not git-ignored, so it MUST be gone before the clean-worktree check.
+.PHONY: package
+package: link
+	@printf '\n[INFO ] Package\n'
+	@rm -rf build/dist-* dist
+	@mkdir dist
+	@echo
+	@bash -c 'if [[ -n `git status --porcelain` ]]; then echo "[ERROR] Worktree not clean as it should be (see: git status)"; exit 1; fi'
+	@# Create Executable bundle.
+	@rm -rf build/dist-bin && mkdir -p build/dist-bin
+	@cp -t build/dist-bin \
+		README*
+	@mkdir build/dist-bin/bin
+	@cp -t build/dist-bin/bin \
+		build/bin/*$(BINEXT)
+	@(cd build/dist-bin && find . -type f -not -name MD5SUM -exec md5sum -b {} \;) > build/MD5SUM
+	@mv build/MD5SUM build/dist-bin/.
+	@(cd build/dist-bin && $(TAR) --owner=0 --group=0 -czf ../../dist/BulkLn-$(PROJECT_VERSION)-$(TOOLS).tgz *)
+	@printf "\n[INFO ] DONE: Artifacts created and placed in 'dist'.\n"
+	@echo
+	@echo See './dist/' for result.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..57904bd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+
+BulkLn
+======
+
+An 'ln'-like tool for creating large numbers of hard links.
+
+Usage:  bulk-ln --help
+
diff --git a/src/bulk_ln/bulk_ln.c b/src/bulk_ln/bulk_ln.c
new file mode 100644
index 0000000..7c7e98a
--- /dev/null
+++ b/src/bulk_ln/bulk_ln.c
@@ -0,0 +1,422 @@
+
+/* Header for this file */
+#include "bulk_ln.h"
+ 
+/* System */
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h> // TODO remove
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* Other packages from this project */
+/*#include ""*/
+
+
+#define DATA_FILE_FIELD_SEP_CHR '\t' /* <- separates <src-path> from <dst-path> */
+
+
+typedef  struct BulkLn  BulkLn;
+
+
+/* Root context. Holds the settings (from CLI args) and the counters of one run. */
+struct BulkLn {
+
+    /** File (path) where we read the paths-to-link from. The special
+     * value - (dash) means to use the stdin stream. */
+    char *dataFilePath;
+    /** The opened stream behind 'dataFilePath' (stdin or the result of fopen) */
+    FILE *dataFd;
+
+    /** If a dry-run got requested. Actions get printed instead of executed. */
+    bool dryRun;
+
+    /** If true, we will print to stdout for every link we create (like we do
+     * in dry run) */
+    bool isPrintEachCreateLink;
+
+    /** If true, we print every call we make to mkdir to stdout. Usually only
+     * useful for debugging */
+    bool isPrintEachMkdir;
+
+    /** If true, we periodically print some kind of progress to stderr */
+    bool isPrintStatus;
+
+    /** If true, we print a summary what we did at the end of the run to stderr */
+    bool isPrintSummary;
+
+    /** If true, we will override existing files. This will be on for example
+     * if --force got specified. */
+    bool isRelinkExistingFiles;
+
+    /** Count of links we created. Gets incremented in dry-run mode too. */
+    int createdLinksCount;
+
+    /** Count of directories we created. Stays untouched in dry-run mode. */
+    int createdDirsCount;
+};
+
+
+
+/** Prints the usage text to stdout. The title line is made of the basename of
+ * this source file and the project version. */
+void printHelp(void){
+    /* strrchr yields NULL if __FILE__ has no '/' (depends on how cc got invoked) */
+    const char *slash = strrchr(__FILE__, '/');
+    printf("\n   %s%s\n", slash ? slash + 1 : __FILE__, " @ " STR_QUOT(PROJECT_VERSION) "\n\n"
+        "Utility to create links. Writing a custom implementation of 'ln'\n"
+        "got necessary as we found no way to instruct 'ln' to create a few thousand\n"
+        "links in an acceptable amount of time. So we just wrote our own ;)\n\n"
+        "Takes paths (pairwise) from stdin (see --stdin for details) and creates a\n"
+        "hardlink for each pair from the 1st path to the 2nd.\n\n"
+        "Options:\n\n"
+        "    --stdin\n"
+        "        Read the path pairs to link from stdin. The format is like:\n"
+        "\n"
+        "          <src-path> <tab> <dst-path> <newline>\n"
+        "\n"
+        "        Example:\n"
+        "\n"
+        "          origin/foo.txt\tnew/gugg.txt\n"
+        "          origin/bar.txt\tnew/da.txt\n"
+        "\n"
+        "        HINT: Preferred <newline> is LF. But CRLF should work too.\n"
+        "\n"
+        "    --quiet\n"
+        "        Don't print status or similar stuff. Errors will still be printed to\n"
+        "        stderr.\n"
+        "\n"
+        "    --verbose\n"
+        "        Print stupid amount of logs. Usually only helpful for debugging. Should\n"
+        "        NOT be combined with --quiet as this would be nonsense anyway.\n"
+        "\n"
+        "    --dry-run\n"
+        "        Will print the actions to stdout instead executing them.\n"
+        "        HINT: The directory count in the summary will be implicitly set to zero,\n"
+        "        as our used counting strategy would deliver wrong results when we not\n"
+        "        actually creating the dirs.\n"
+        "\n"
+        "    --force\n"
+        "        Same meaning as in original 'ln' command.\n"
+        "\n");
+}
+
+
+/** Resets 'bulkLn' to the defaults and then applies the CLI args on top.
+ * Returns zero if the run should go on. Returns non-zero otherwise (unknown
+ * arg, missing '--stdin', or '--help'). Messages got printed in here already
+ * in that case. */
+int parseArgs( int argc, char**argv, BulkLn*bulkLn ){
+    /* Defaults: act for real, keep existing files, report status and summary */
+    bulkLn->dataFilePath = NULL;
+    bulkLn->dryRun = false;
+    bulkLn->isPrintEachCreateLink = false;
+    bulkLn->isPrintEachMkdir = false;
+    bulkLn->isPrintStatus = true;
+    bulkLn->isPrintSummary = true;
+    bulkLn->isRelinkExistingFiles = false;
+
+    /* Args get applied in the order given. So if two of them touch the same
+     * flag (eg --quiet and --verbose) the later one wins. argv[0] is skipped. */
+    for( int iArg = 1 ; iArg < argc ; ++iArg ){
+        const char *opt = argv[iArg];
+        if( strcmp(opt, "--help") == 0 ){
+            printHelp();
+            return -1; /* <- non-zero, so the caller does not go on */
+        }else if( strcmp(opt, "--dry-run") == 0 ){
+            bulkLn->dryRun = true;
+        }else if( strcmp(opt, "--force") == 0 ){
+            bulkLn->isRelinkExistingFiles = true;
+        }else if( strcmp(opt, "--quiet") == 0 ){
+            bulkLn->isPrintStatus = false;
+            bulkLn->isPrintSummary = false;
+        }else if( strcmp(opt, "--stdin") == 0 ){
+            /* Dash is the special value which stands for the stdin stream */
+            bulkLn->dataFilePath = "-";
+        }else if( strcmp(opt, "--verbose") == 0 ){
+            /* Simply turn on every kind of output we have */
+            bulkLn->isPrintStatus = true;
+            bulkLn->isPrintSummary = true;
+            bulkLn->isPrintEachCreateLink = true;
+            bulkLn->isPrintEachMkdir = true;
+        }else{
+            fprintf(stderr, "Unknown arg '%s'.\n", opt);
+            return -1;
+        }
+    }
+
+    /* An input method MUST be given, even though there is only one. Requiring
+     * an arg is the simplest protection against damage by someone invoking
+     * the utility without args (eg accidentally). It also leaves room to add
+     * more input methods later without breaking everything. */
+    if( bulkLn->dataFilePath != NULL ){
+        return 0;
+    }
+
+    /* Still here? Then no input method got selected. */
+    fprintf(stderr, "Arg '--stdin' missing. Try  --help\n");
+    return -1;
+}
+
+
+/**
+ * Like:  mkdir -p path
+ *
+ * Creates 'path' and all its missing parents, topmost first, so each 'mkdir'
+ * call only ever has to add one more segment. Already existing dirs are fine.
+ * An empty 'path' means "nothing to create" and succeeds without any action
+ * (happens for link targets which reside directly in the root dir, eg "/foo").
+ *
+ * In dry-run mode the calls are only printed and nothing gets counted.
+ *
+ * WARN: Passed 'path' gets temporarily changed during execution. It is
+ * restored to its original state before return, also in the error case.
+ *
+ * Returns 0 on success and -1 (message already printed to stderr) on error.
+ */
+static int mkdirs( char*path, BulkLn*bulkLn ){
+    int err = 0;
+    char *tmpEnd = path;
+    /* Remember the original end, as we are going to place zeroes in the path
+     * during the run. */
+    char *pathEnd = path + strlen(path);
+
+    if( pathEnd == path ){
+        /* Nothing to create. Also 'tmpEnd + 1' below would step behind the
+         * terminator of an empty string. */
+        return 0;
+    }
+
+    for(;;){
+        /* Advance to the next slash. Searching from 'tmpEnd + 1' skips both a
+         * leading slash (absolute path) and the slash we stopped at before. */
+        tmpEnd = strchr(tmpEnd + 1, '/');
+        if( tmpEnd == NULL ){
+            /* No more slashes. So this is the last (innermost) segment */
+            tmpEnd = pathEnd;
+        }
+        bool isLastSegment = (tmpEnd == pathEnd);
+
+        /* Temporarily zero-terminate the path so it names the dir up to here */
+        tmpEnd[0] = '\0';
+
+        /* Print if requested */
+        if( bulkLn->dryRun || bulkLn->isPrintEachMkdir ){
+            printf("mkdir(%s)\n", path);
+        }
+
+        if( ! bulkLn->dryRun ){
+            /* Perform the real action. Mode gets masked by umask and after
+             * that we have the defaults (aka what we want) */
+            if( mkdir(path, 0777) == 0 ){
+                /* Only count dirs we really created (did NOT exist before) */
+                bulkLn->createdDirsCount += 1;
+            }else if( errno != EEXIST ){
+                fprintf(stderr, "mkdir(%s): %s\n", path, strerror(errno));
+                err = -1;
+            } /* else: EEXIST. Fine :) Just continue with the next one. */
+        }
+
+        /* Undo our cut BEFORE leaving, so callers get their path back intact
+         * on every exit. Nothing to undo at end-of-string. */
+        if( ! isLastSegment ){
+            tmpEnd[0] = '/';
+        }
+        if( err || isLastSegment ){
+            break;
+        }
+        /* Loop to add and process one more segment */
+    }
+
+    return err;
+}
+
+
+/** Like:  ln srcPath dstPath  (respectively  ln -f  if relinking of existing
+ * files is enabled). In dry-run mode the call only gets printed. The link
+ * counter gets incremented on success, in dry-run mode too.
+ * Returns 0 on success, -1 on error (reported on stderr in here). */
+static int createHardlink( char*srcPath, char*dstPath, BulkLn*bulkLn ){
+    const bool isDryRun = bulkLn->dryRun;
+
+    if( isDryRun || bulkLn->isPrintEachCreateLink ){
+        printf("link('%s', '%s')\n", srcPath, dstPath);
+    }
+
+    if( isDryRun ){
+        /* Nothing to do for real. But count it as if we did. */
+        bulkLn->createdLinksCount += 1;
+        return 0;
+    }
+
+    if( bulkLn->isRelinkExistingFiles ){
+        /* Get a possibly existing entry out of the way first, so 'link'
+         * below cannot fail just because the name is taken already. */
+        int unlinkRet = unlink(dstPath);
+        /* ENOENT means there was nothing to delete. That is fine too. */
+        if( unlinkRet != 0 && errno != ENOENT ){
+            fprintf(stderr, "unlink(%s): %s\n", dstPath, strerror(errno));
+            return -1;
+        }
+    }
+
+    /* Perform the real action */
+    int linkRet = link(srcPath, dstPath);
+    if( linkRet != 0 ){
+        fprintf(stderr, "link('%s', '%s'): %s\n", srcPath, dstPath, strerror(errno));
+        return -1;
+    }
+
+    bulkLn->createdLinksCount += 1;
+    return 0;
+}
+
+
+/** Handles one record: makes sure the parent dir of 'dstPath' exists, then
+ * links 'srcPath' to 'dstPath'. 'dstPath' gets modified temporarily.
+ * Returns 0 on success, -1 on error. */
+static int onPathPair( char*srcPath, char*dstPath, BulkLn*bulkLn ){
+    assert(srcPath != NULL);
+    assert(dstPath != NULL);
+    assert(bulkLn != NULL);
+
+    /* Progress report. Fires whenever the count so far is a multiple of
+     * 10000. Zero is one too, so it also fires right at the start. */
+    const int numLinks = bulkLn->createdLinksCount;
+    if( bulkLn->isPrintStatus && numLinks % 10000 == 0 ){
+        fprintf(stderr, "Created %7d links so far.\n", numLinks);
+    }
+
+    /* The last slash separates the parent dir from the file name. Without
+     * any slash there is no parent dir we would have to care about. */
+    char *lastSlash = strrchr(dstPath, '/');
+    if( lastSlash != NULL ){
+        /* Hide the file name for a moment, so 'dstPath' names the parent */
+        *lastSlash = '\0';
+        int mkdirsRet = mkdirs(dstPath, bulkLn);
+        if( mkdirsRet != 0 ) return -1; /* <- 'dstPath' stays cut in this case */
+        *lastSlash = '/';
+    }
+
+    if( createHardlink(srcPath, dstPath, bulkLn) != 0 ) return -1;
+
+    return 0;
+}
+
+
+/** Reads the data file line-by-line and hands each "<src> <tab> <dst>" record
+ * to 'onPathPair'. Every line MUST contain exactly one tab. A trailing LF or
+ * CRLF gets stripped from the destination path. Stops at the first error.
+ * Returns 0 on success, -1 on error (message got printed to stderr already). */
+static int parseDataFileAsPairPerLine( BulkLn*bulkLn ){
+    int err;
+    size_t buf_cap = 0;
+    char *buf = NULL; /* <- managed by getline, released at 'finally' */
+    size_t lineNum = 0; /* <- count of lines read so far (aka 1-based line nr) */
+
+    for(;;){
+        /* Read input line-by-line. Not the most elegant way to parse stuff,
+         * but should suffice for our use-case. Use 'ssize_t' as that is what
+         * getline returns (an 'int' could truncate the length). */
+        ssize_t readLen = getline(&buf, &buf_cap, bulkLn->dataFd);
+        if(unlikely( readLen < 0 )){
+            if( feof(bulkLn->dataFd) ){
+                break; /* End-Of-File. Just break off the loop */
+            }
+            /* Real failure. Includes the ones which do not set the stream
+             * error flag (eg out-of-memory), so report instead of abort. */
+            fprintf(stderr, "getline(%s): %s\n", bulkLn->dataFilePath, strerror(errno));
+            err = -1; goto finally;
+        }
+        size_t buf_len = (size_t)readLen;
+        char *buf_end = buf + buf_len;
+        /* Only count lines we really got, so the total printed below is right */
+        lineNum += 1;
+
+        /* Extract the two paths from our line */
+        char *srcPath = buf;
+        char *tab = memchr(buf, DATA_FILE_FIELD_SEP_CHR, buf_len);
+        if( tab == NULL ){
+            fprintf(stderr, "Too few field separators (tab) in '%s' @ %zu\n",
+                    bulkLn->dataFilePath, lineNum);
+            err = -1; goto finally;
+        }
+        char *dstPath = tab + 1; /* <- path starts one char after the separator */
+        /* Search the REST of the line (everything behind the 1st tab, up to
+         * the end of what we have read) for another tab. */
+        char *unwantedTab = memchr(dstPath, DATA_FILE_FIELD_SEP_CHR, (size_t)(buf_end - dstPath));
+        if( unwantedTab != NULL ){
+            fprintf(stderr, "Too many field separators (tab) in '%s' @ %zu\n",
+                    bulkLn->dataFilePath, lineNum);
+            err = -1; goto finally;
+        }
+
+        /* Strip line terminator (LF or CRLF) by moving the end backward. Never
+         * walks past 'dstPath', so an empty destination stays empty. */
+        char *dstPath_end = buf_end;
+        while( dstPath_end > dstPath
+            && (dstPath_end[-1]=='\n' || dstPath_end[-1]=='\r' || dstPath_end[-1]=='\0')
+        ){
+            dstPath_end -= 1;
+        }
+
+        /* Zero-Terminate the two strings */
+        tab[0] = '\0';
+        dstPath_end[0] = '\0';
+
+        /* Publish this pair for processing */
+        err = onPathPair(srcPath, dstPath, bulkLn);
+        if( err ){ err = -1; goto finally; }
+    }
+
+    if( bulkLn->isPrintStatus ){
+        fprintf(stderr, "Parsed %zu records from '%s'\n", lineNum, bulkLn->dataFilePath);
+    }
+
+    err = 0;
+    finally:
+    free(buf);
+    return err;
+}
+
+
+/** Entry point of the utility: parse args, open the data source, create the links
+ * and print the summary (if enabled). Returns 0 on success, -1 otherwise (also for --help). */
+int bulk_ln_main( int argc, char**argv ){
+    int err;
+    BulkLn ctx = {0}; /* <- zeroed, so counters start at 0 and dataFd at NULL */
+    BulkLn *bulkLn = &ctx;
+
+    err = parseArgs(argc, argv, bulkLn);
+    if( err ){ err = -1; goto finally; }
+
+    /* Open data source */
+    if( !strcmp(bulkLn->dataFilePath, "-") ){
+        bulkLn->dataFd = stdin;
+    }else{
+        bulkLn->dataFd = fopen(bulkLn->dataFilePath, "rb");
+        if( bulkLn->dataFd == NULL ){
+            fprintf(stderr, "fopen(%s): %s\n", bulkLn->dataFilePath, strerror(errno));
+            err = -1; goto finally;
+        }
+    }
+
+    err = parseDataFileAsPairPerLine(bulkLn);
+    if( err ){ err = -1; goto finally; }
+
+    if( bulkLn->isPrintSummary ){
+        fprintf(stderr, "Created %d directories and linked %d files.\n",
+                bulkLn->createdDirsCount, bulkLn->createdLinksCount);
+    }
+
+    err = 0;
+    finally:
+    if( bulkLn->dataFd != NULL && bulkLn->dataFd != stdin ){ /* <- stdin is not ours to close */
+        fclose(bulkLn->dataFd); bulkLn->dataFd = NULL;
+    }
+    return err;
+}
+
diff --git a/src/bulk_ln/bulk_ln.h b/src/bulk_ln/bulk_ln.h
new file mode 100644
index 0000000..6054386
--- /dev/null
+++ b/src/bulk_ln/bulk_ln.h
@@ -0,0 +1,10 @@
+#ifndef INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c
+#define INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c
+
+#include "commonbase.h"
+/** Runs the whole utility using the passed CLI args. Returns 0 on success and
+ * -1 otherwise (also when only the help got printed). */
+int bulk_ln_main( int argc, char**argv );
+
+
+#endif /* INCGUARD_87768d11a9f7d831f2a8b9d0ab7efa9c */
diff --git a/src/bulk_ln/bulk_ln_main.c b/src/bulk_ln/bulk_ln_main.c
new file mode 100644
index 0000000..a3ac228
--- /dev/null
+++ b/src/bulk_ln/bulk_ln_main.c
@@ -0,0 +1,10 @@
+
+#include "bulk_ln.h"
+
+int main( int argc, char**argv ){
+    int err = bulk_ln_main(argc, argv);
+    /* Exit status is 7-bit only (See POSIX). Range-check BEFORE negating, as '-INT_MIN' overflows */
+    if( err < -127 || err > 127 ) return 1;
+    return err < 0 ? -err : err;
+}
+
diff --git a/src/common/commonbase.h b/src/common/commonbase.h
new file mode 100644
index 0000000..497d3fa
--- /dev/null
+++ b/src/common/commonbase.h
@@ -0,0 +1,23 @@
+
+/*
+ * Common config for the project. Here goes stuff like feature-test-macros etc.
+ *
+ * Every header file MUST include this file AS THE 1ST include, because
+ * feature-test-macros only take effect if defined before any system header.
+ */
+/* Request the POSIX.1-2008 declarations (eg getline) from the system headers */
+#define _POSIX_C_SOURCE 200809L 
+
+/* Stringify AFTER macro expansion. The helper macro adds the needed extra pass. */
+#define STR_QUOT_IAHGEWIH(s) #s
+#define STR_QUOT(s) STR_QUOT_IAHGEWIH(s)
+/* Paste two tokens. NOTE: The args are NOT macro-expanded before pasting. */
+#define STR_CAT(a, b) a ## b
+/* Branch hints. Plain pass-through here, unless already defined beforehand. */
+#ifndef likely
+#   define likely(a) (a)
+#endif
+#ifndef unlikely
+#   define unlikely(a) (a)
+#endif
+
author	Andreas Fankhauser (@tux-six)	2022-05-30 16:13:08 +0200
committer	Andreas Fankhauser (@tux-six)	2022-05-30 16:13:08 +0200
commit	3f8ac6551aa0f53fd14b216e97e79939635b8b82 (patch)
tree	d71d6b71cd5781692ca95fe938de630bd286e647
parent	4d54d900cd2cfb880b5cada3b9baef6a30a04332 (diff)
download	bulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.zip bulk-ln-3f8ac6551aa0f53fd14b216e97e79939635b8b82.tar.gz