incubator-heron icon indicating copy to clipboard operation
incubator-heron copied to clipboard

Duplicate files installed bumping up install size by ~190M

Open Code0x58 opened this issue 4 years ago • 2 comments

The current install is ~816M as seen in the docker images, with ~20% duplicate files. It may be worth cleaning up the install (preferably by reworking the install script) if anyone wants to reduce container sizes:

# Report groups of byte-identical regular files below a directory.
#
# Arguments: $1 - directory to scan (default: /usr/local)
# Outputs:   one line per group, "<path> <path> ... : <wasted size>", ordered
#            by wasted size, then a final "total duplicated size: <size>".
#            "Wasted" is the file size times the number of redundant copies.
# Returns:   non-zero if the directory cannot be entered.
# Requires:  GNU find, xargs, md5sum, sort and numfmt.
# Note:      file names containing a newline or backslash are skipped.
report_duplicates() (
    # Subshell body: the cd does not leak to the caller, and a bad directory
    # aborts the scan instead of hashing the current directory by accident.
    cd "${1:-/usr/local}" || exit

    awk '
        # Human-readable (IEC) form of a byte count. Only digits are ever
        # placed on the command line, so no file name reaches the shell.
        function human(bytes,    cmd, text) {
            cmd = "numfmt --to=iec " sprintf("%.0f", bytes)
            cmd | getline text
            close(cmd)
            return text
        }

        # Emit the group collected so far if it really holds duplicates.
        function end_group(    wasted, line) {
            if (copies > 1) {
                wasted = size * (copies - 1)
                total += wasted
                line = names ": " human(wasted)
                print line | sorter
            }
        }

        BEGIN { sorter = "sort -t: -h -k2" }

        # First input: "<bytes>\t<path>" for every file.
        NR == FNR {
            tab = index($0, "\t")
            bytes_of[substr($0, tab + 1)] = substr($0, 1, tab - 1)
            next
        }

        # md5sum marks names holding a backslash or newline with a leading
        # backslash and escapes them; they cannot be matched back, so skip.
        substr($0, 1, 1) == "\\" { next }

        # Second input: "<md5> *<path>", sorted so equal hashes are adjacent.
        {
            hash = substr($0, 1, 32)
            path = substr($0, 35)
            if (hash != last) {
                end_group()
                last = hash
                copies = 0
                names = ""
                size = bytes_of[path]
            }
            copies++
            names = names path " "
        }

        END {
            end_group()
            # Let the sorted group lines come out before the grand total.
            close(sorter)
            print "total duplicated size: " human(total)
        }
    ' <(find . -type f -printf '%s\t%p\n') \
      <(find . -type f -print0 | xargs -0 -r md5sum --binary | LC_ALL=C sort)
)

report_duplicates /usr/local

Gives a total of 190M duplicated:

: 0
./heron/conf/examples/roundrobin_packing.yaml ./heron/conf/mesos/client.yaml : 800
./heron/conf/local/client.yaml ./heron/conf/sandbox/client.yaml : 984
./heron/conf/aurora/client.yaml ./heron/conf/marathon/client.yaml : 1.1K
./heron/conf/kubernetes/uploader.yaml ./heron/conf/nomad/uploader.yaml : 1.2K
./heron/conf/slurm/statemgr.yaml ./heron/conf/yarn/statemgr.yaml : 1.2K
./heron/include/spout/irich-spout.h ./heron/include/topology/irich-spout.h : 1.3K
./heron/include/spout/base-rich-spout.h ./heron/include/topology/base-rich-spout.h : 1.5K
./heron/bin/heron-apiserver ./heron/bin/heron-apiserver.sh : 1.6K
./heron/conf/kubernetes/stateful.yaml ./heron/conf/nomad/stateful.yaml : 1.6K
./heron/conf/localzk/client.yaml ./heron/conf/slurm/client.yaml ./heron/conf/yarn/client.yaml : 1.8K
./heron/include/spout/ispout-output-collector.h ./heron/include/topology/ispout-output-collector.h : 2.5K
./heron/conf/local/scheduler.yaml ./heron/conf/localzk/scheduler.yaml ./heron/conf/sandbox/scheduler.yaml : 2.7K
./heron/bin/heron-downloader-config ./heron/bin/heron-downloader-config.sh ./heron/dist/heron-core/bin/heron-downloader-config ./heron/dist/heron-core/bin/heron-downloader-config.sh : 2.8K
./heron/conf/local/stateful.yaml ./heron/conf/localzk/stateful.yaml ./heron/conf/sandbox/stateful.yaml : 3.3K
./heron/bin/heron-downloader ./heron/bin/heron-downloader.sh ./heron/dist/heron-core/bin/heron-downloader ./heron/dist/heron-core/bin/heron-downloader.sh : 3.6K
./heron/conf/local/statemgr.yaml ./heron/conf/marathon/statemgr.yaml ./heron/conf/mesos/statemgr.yaml ./heron/conf/sandbox/statemgr.yaml : 4.0K
./heron/conf/local/healthmgr.yaml ./heron/conf/sandbox/healthmgr.yaml ./heron/conf/yarn/healthmgr.yaml : 4.4K
./heron/conf/local/uploader.yaml ./heron/conf/localzk/uploader.yaml ./heron/conf/mesos/uploader.yaml ./heron/conf/sandbox/uploader.yaml ./heron/conf/slurm/uploader.yaml : 4.7K
./heron/conf/marathon/stateful.yaml ./heron/conf/mesos/stateful.yaml ./heron/conf/slurm/stateful.yaml ./heron/conf/yarn/stateful.yaml : 4.9K
./heron/include/spout/ispout.h ./heron/include/topology/ispout.h : 5.6K
./heron/conf/aurora/downloader.yaml ./heron/conf/examples/downloader.yaml ./heron/conf/kubernetes/downloader.yaml ./heron/conf/local/downloader.yaml ./heron/conf/localzk/downloader.yaml ./heron/conf/nomad/downloader.yaml ./heron/conf/sandbox/downloader.yaml ./heron/conf/yarn/downloader.yaml total duplicated size: 7.7K
./heron/conf/aurora/packing.yaml ./heron/conf/kubernetes/packing.yaml ./heron/conf/local/packing.yaml ./heron/conf/localzk/packing.yaml ./heron/conf/marathon/packing.yaml ./heron/conf/mesos/packing.yaml ./heron/conf/nomad/packing.yaml ./heron/conf/sandbox/packing.yaml ./heron/conf/slurm/packing.yaml ./heron/conf/yarn/packing.yaml : 9.1K
./heron/conf/local/heron_internals.yaml ./heron/conf/sandbox/heron_internals.yaml : 13K
./heron/conf/aurora/heron_internals.yaml ./heron/conf/examples/heron_internals.yaml ./heron/conf/localzk/heron_internals.yaml ./heron/conf/marathon/heron_internals.yaml ./heron/conf/mesos/heron_internals.yaml ./heron/conf/slurm/heron_internals.yaml ./heron/conf/yarn/heron_internals.yaml : 74K
./heron/conf/aurora/metrics_sinks.yaml ./heron/conf/local/metrics_sinks.yaml ./heron/conf/localzk/metrics_sinks.yaml ./heron/conf/marathon/metrics_sinks.yaml ./heron/conf/mesos/metrics_sinks.yaml ./heron/conf/sandbox/metrics_sinks.yaml ./heron/conf/slurm/metrics_sinks.yaml ./heron/conf/yarn/metrics_sinks.yaml : 76K
./heron/dist/heron-core/lib/statemgr/heron-localfs-statemgr.jar ./heron/lib/statemgr/heron-localfs-statemgr.jar : 5.7M
./heron/dist/heron-core/lib/scheduler/heron-scheduler.jar ./heron/lib/scheduler/heron-scheduler.jar : 7.8M
./heron/dist/heron-core/lib/scheduler/heron-local-scheduler.jar ./heron/lib/scheduler/heron-local-scheduler.jar : 8.0M
./heron/dist/heron-core/lib/scheduler/heron-marathon-scheduler.jar ./heron/lib/scheduler/heron-marathon-scheduler.jar : 8.0M
./heron/dist/heron-core/lib/scheduler/heron-slurm-scheduler.jar ./heron/lib/scheduler/heron-slurm-scheduler.jar : 8.0M
./heron/dist/heron-core/lib/statemgr/heron-zookeeper-statemgr.jar ./heron/lib/statemgr/heron-zookeeper-statemgr.jar : 11M
./heron/dist/heron-core/lib/packing/heron-binpacking-packing.jar ./heron/lib/packing/heron-binpacking-packing.jar ./heron/lib/scheduler/heron-binpacking-packing.jar : 12M
./heron/dist/heron-core/lib/packing/heron-roundrobin-packing.jar ./heron/lib/packing/heron-roundrobin-packing.jar ./heron/lib/scheduler/heron-roundrobin-packing.jar : 12M
./heron/dist/heron-core/lib/metricscachemgr/heron-metricscachemgr.jar ./heron/lib/metricscachemgr/heron-metricscachemgr.jar : 13M
./heron/dist/heron-core/lib/scheduler/heron-mesos-scheduler.jar ./heron/lib/scheduler/heron-mesos-scheduler.jar : 14M
./heron/dist/heron-core/lib/scheduler/heron-nomad-scheduler.jar ./heron/lib/scheduler/heron-nomad-scheduler.jar : 15M
./heron/dist/heron-core/lib/scheduler/heron-kubernetes-scheduler.jar ./heron/lib/scheduler/heron-kubernetes-scheduler.jar : 36M
./heron/dist/heron-core/lib/downloaders/heron-downloader.jar ./heron/lib/downloaders/heron-downloader.jar : 45M

Most of that (182.9M) is duplicated between /usr/local/heron/lib/ and /usr/local/heron/dist/heron-core/lib/:

# List files found under the same relative path, with the same size as
# printed by `ls -sh`, in more than one of the given directories.
#
# Arguments: directories to compare (default: the two Heron lib directories)
# Outputs:   "<count> <size> <relative path>" for every repeated entry.
list_shared_files() {
    if (( $# == 0 )); then
        set -- /usr/local/heron/lib/ /usr/local/heron/dist/heron-core/lib/
    fi

    local dir
    for dir in "$@"; do
        # The subshell keeps each cd local. The file name is handed to ls as
        # an argument rather than spliced into an `sh -c` string, so names
        # with spaces or quotes are listed correctly.
        (cd "$dir" && find . -type f -exec ls -sh -- {} \;)
    done | sort | uniq --count --repeated
}

list_shared_files
      2 11M ./statemgr/heron-zookeeper-statemgr.jar
      2 13M ./metricscachemgr/heron-metricscachemgr.jar
      2 14M ./scheduler/heron-mesos-scheduler.jar
      2 15M ./scheduler/heron-nomad-scheduler.jar
      2 36M ./scheduler/heron-kubernetes-scheduler.jar
      2 45M ./downloaders/heron-downloader.jar
      2 5.7M ./packing/heron-binpacking-packing.jar
      2 5.7M ./packing/heron-roundrobin-packing.jar
      2 5.7M ./statemgr/heron-localfs-statemgr.jar
      2 7.8M ./scheduler/heron-scheduler.jar
      2 8.0M ./scheduler/heron-local-scheduler.jar
      2 8.0M ./scheduler/heron-marathon-scheduler.jar
      2 8.0M ./scheduler/heron-slurm-scheduler.jar

Code0x58 avatar Jan 24 '21 00:01 Code0x58

Is the script accounting for symlinks? I believe some of the contents in /heron are symlinks to another location.

nicknezis avatar Jan 24 '21 18:01 nicknezis

> Is the script accounting for symlinks? I believe some of the contents in /heron are symlinks to another location.

Good question, but not in these cases. find doesn't follow symlinks by default, and in the first script both find invocations start from the same place, so a symlinked path cannot be reported as a duplicate. The second script starts from two different places, but there are no symlinks in the paths /usr/local/heron/lib and /usr/local/heron/dist/heron-core/lib.

Here are all the symlinks that exist, in case it helps with a clean-up:

# Symlinks present in the image, written as the commands that create them.
# Bash brace expansion turns each {a,b,...} item into its own link source.

# made during heron-install.sh
ln -s /usr/local/heron/bin/heron{,-apiserver,-explorer,-tracker,-ui} /usr/local/bin/

# made in the Dockerfile
ln -s /usr/local/heron/dist/heron-core /heron/
ln -s /usr/local/heron/{examples,release.yaml} /heron/
ln -s /usr/local/heron/{bin,conf,dist,lib,release.yaml} /heron/heron-tools/

Code0x58 avatar Jan 24 '21 19:01 Code0x58