# Targets that never create a file of their own name. The pipeline targets
# (stage0, stage1, copy_originals, bolt_instrument, merge_data, bolt) are stamp files
# created with `touch $@` and must therefore NOT be listed here.
.PHONY: clean clean_profiles restore_originals delete_originals finish_stage1

# Optimization flags handed to llvm-bolt by the `bolt` target.
# Settings taken from https://github.com/rust-lang/rust/blob/master/src/tools/opt-dist/src/bolt.rs
BOLT_ARGS :=
# Reorder basic blocks within functions
BOLT_ARGS += -reorder-blocks=ext-tsp
# Reorder functions within the binary
BOLT_ARGS += -reorder-functions=cdsort
# Split function code into hot and cold regions
BOLT_ARGS += -split-functions
# Split as many basic blocks as possible
BOLT_ARGS += -split-all-cold
# Move jump tables to a separate section
BOLT_ARGS += -jump-tables=move
# Use regular size pages for code alignment
BOLT_ARGS += -no-huge-pages
# Fold functions with identical code
BOLT_ARGS += -icf=1
# Split using best available strategy (three-way splitting, Cache-Directed Sort)
# Disabled for libjulia-internal till https://github.com/llvm/llvm-project/issues/89508 is fixed;
# the `bolt` target appends this flag itself for every library except libjulia-internal.
# BOLT_ARGS += -split-strategy=cdsplit
# Update DWARF debug info in the final binary
BOLT_ARGS += -update-debug-sections
# Print optimization statistics
BOLT_ARGS += -dyno-stats
# BOLT doesn't fully support computed gotos, https://github.com/llvm/llvm-project/issues/89117
# Use an escaped regex, as the name BOLT recognises is often a bit different, e.g. apply_cl/1(*2)
# This doesn't actually seem to do anything; the actual mitigation is not using --use-old-text
# for libjulia-internal, which is handled in the bolt target.
BOLT_ARGS += -skip-funcs=.\*apply_cl.\*

# Variable assignments passed on the command line of the stage1 sub-make.
# -fno-reorder-blocks-and-partition is needed on gcc >= 8.
# A trailing `$\` continues the line without inserting extra whitespace.
BOLT_FLAGS := $\
	"BOLT_CFLAGS_GCC+=-fno-reorder-blocks-and-partition" $\
	"BOLT_LDFLAGS=-Wl,--emit-relocs"

# Build tree in which BOLT itself is installed (see the stage0 target).
STAGE0_BUILD:=$(CURDIR)/toolchain
# Build tree whose libraries get instrumented and optimized.
STAGE1_BUILD:=$(CURDIR)/optimized.build

# Directory holding the stage0 tools; note the trailing slash, tool names are appended directly.
STAGE0_BINARIES:=$(STAGE0_BUILD)/usr/bin/

# Directory the instrumented libraries write their profile data into.
PROFILE_DIR:=$(CURDIR)/profiles-bolt
# Top of the Julia source tree, two levels above this directory.
JULIA_ROOT:=$(CURDIR)/../..

# BOLT tools taken from the stage0 build.
LLVM_BOLT:=$(STAGE0_BINARIES)llvm-bolt
LLVM_MERGEFDATA:=$(STAGE0_BINARIES)merge-fdata

# If you add new files to optimize, you need to add BOLT_LDFLAGS and BOLT_CFLAGS to the build of your new file.
SYMLINKS_TO_OPTIMIZE := libLLVM.so libjulia-internal.so libjulia-codegen.so
# Resolve each symlink in $(STAGE1_BUILD)/usr/lib to the name it points at; the recipes
# below use each result as a file name relative to $(STAGE1_BUILD)/usr/lib.
# This is deliberately a recursive (`=`) variable: the symlinks only exist after stage1 has
# been built, so readlink has to run when a recipe expands the variable, not when the
# Makefile is parsed (at which point the list would be empty on a fresh tree).
FILES_TO_OPTIMIZE = $(shell for file in $(SYMLINKS_TO_OPTIMIZE); do readlink $(STAGE1_BUILD)/usr/lib/$$file; done)

# Hint echoed at the end of the bolt_instrument target. The value is wrapped in single quotes
# so the shell passes the backticks through to echo literally instead of running them.
AFTER_INSTRUMENT_MESSAGE:='Run `make finish_stage1` to finish off the build. $\
	You can now optionally collect more profiling data by running Julia with an appropriate workload, $\
	if you wish, run `make clean_profiles` before doing so to remove any profiling data generated by `make finish_stage1`. $\
	You should end up with some data in $(PROFILE_DIR). Afterwards run `make merge_data && make bolt`.'

# Set up a stage's build directory by invoking the `configure` target of the Julia tree
# at $(JULIA_ROOT) with O set to that directory. The same recipe serves both stages.
$(STAGE0_BUILD) $(STAGE1_BUILD):
	$(MAKE) -C $(JULIA_ROOT) O=$@ configure

# Stamp target: install BOLT from the deps directory of the stage0 build tree.
# The build directory is an order-only prerequisite, so it only has to exist.
stage0: | $(STAGE0_BUILD)
	$(MAKE) -C $(STAGE0_BUILD)/deps install-BOLT && touch $@

# Stamp target for the build that gets optimized. It is built with BOLT_FLAGS; since the
# binary builder doesn't use our custom flags, LLVM has to be built from source for now.
# Only the goals listed below are built: package image creation is skipped on purpose
# so that it can be profiled later.
$(STAGE1_BUILD): stage0
stage1: export USE_BINARYBUILDER_LLVM=0
stage1: | $(STAGE1_BUILD)
	$(MAKE) -C $(STAGE1_BUILD) $(BOLT_FLAGS) \
		julia-src-release \
		julia-symlink \
		julia-libccalltest \
		julia-libccalllazyfoo \
		julia-libccalllazybar \
		julia-libllvmcalltest && \
	touch $@

# Stamp target: keep a copy of every library to optimize next to it as <file>.original.
# llvm-bolt later reads the .original copy and overwrites the library in place.
# `set -e` aborts on the first failing cp so the stamp is never created after a partial
# copy. The loop is followed by `;` rather than `&&` because `set -e` is ignored on the
# left-hand side of `&&`.
copy_originals: stage1
	set -e; \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file="$(STAGE1_BUILD)/usr/lib/$$file"; \
		cp "$$abs_file" "$$abs_file.original"; \
	done; \
	touch $@

# Stamp target: replace every library with an instrumented build of its .original copy.
# The instrumented libraries write their profiles to $(PROFILE_DIR)/<file>-prof, with the pid appended.
# I don't think there's any particular reason to have -no-huge-pages here, perhaps slightly more accurate profile data
# as the final build uses -no-huge-pages
# We reset the mtime of the files to prevent make from rebuilding targets depending on them.
# `set -e` aborts on the first failing command so the stamp is not created after a failed
# instrumentation. The loop is followed by `;` rather than `&&` because `set -e` is ignored
# on the left-hand side of `&&`.
bolt_instrument: copy_originals
	set -e; \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file="$(STAGE1_BUILD)/usr/lib/$$file"; \
		old_time=$$(stat -c %Y "$$abs_file"); \
		mkdir -p "$$(dirname "$(PROFILE_DIR)/$$file-prof")"; \
		$(LLVM_BOLT) "$$abs_file.original" -o "$$abs_file" --instrument --instrumentation-file-append-pid --instrumentation-file="$(PROFILE_DIR)/$$file-prof" -no-huge-pages; \
		touch -d "@$$old_time" "$$abs_file"; \
		printf "\n"; \
	done; \
	touch $@
	@echo $(AFTER_INSTRUMENT_MESSAGE)

# Run the default goal of the stage1 build tree to finish off the build that `stage1`
# only partially performed. According to the message printed by bolt_instrument, this
# step itself generates profiling data.
finish_stage1: stage1
	$(MAKE) -C $(STAGE1_BUILD)

# Stamp target: merge all per-process profiles of each library into <file>-prof.merged.fdata.
# A merged file left over from an earlier run is removed first, because its name also
# matches the input glob below.
# `set -e` aborts on the first failing command (including merge-fdata being handed an
# unmatched glob when no profiles exist) so the stamp is not created after a failed merge.
# The loop is followed by `;` rather than `&&` because `set -e` is ignored on the left-hand
# side of `&&`.
merge_data: bolt_instrument
	set -e; \
	for file in $(FILES_TO_OPTIMIZE); do \
		merged="$(PROFILE_DIR)/$$file-prof.merged.fdata"; \
		rm -f "$$merged"; \
		profiles=$(PROFILE_DIR)/$$file-prof.*.fdata; \
		$(LLVM_MERGEFDATA) $$profiles > "$$merged"; \
	done; \
	touch $@

# Stamp target: rebuild every library from its .original copy, optimized with the merged profile.
# The --use-old-text saves about 16 MiB of libLLVM.so size.
# However, the rust folk found it succeeds very non-deterministically for them.
# It tries to reuse old text segments to reduce binary size
# BOLT doesn't fully support computed gotos https://github.com/llvm/llvm-project/issues/89117, so we cannot use --use-old-text on libjulia-internal
# That flag saves less than 1 MiB for libjulia-internal so oh well.
# -split-strategy=cdsplit is likewise only added for the libraries other than libjulia-internal.
# We reset the mtime of the files to prevent make from rebuilding targets depending on them.
# `set -e` aborts on the first failing command so the stamp is not created after a failed
# optimization. The loop is followed by `;` rather than `&&` because `set -e` is ignored on
# the left-hand side of `&&`.
bolt: merge_data
	set -e; \
	internal_lib=$$(readlink "$(STAGE1_BUILD)/usr/lib/libjulia-internal.so"); \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file="$(STAGE1_BUILD)/usr/lib/$$file"; \
		old_time=$$(stat -c %Y "$$abs_file"); \
		extra_args=""; \
		if [ "$$file" != "$$internal_lib" ]; then \
			extra_args="--use-old-text -split-strategy=cdsplit"; \
		fi; \
		$(LLVM_BOLT) "$$abs_file.original" -data "$(PROFILE_DIR)/$$file-prof.merged.fdata" -o "$$abs_file" $(BOLT_ARGS) $$extra_args; \
		touch -d "@$$old_time" "$$abs_file"; \
	done; \
	touch $@

# Delete the whole profile directory, i.e. all collected and merged profile data.
clean_profiles:
	rm -rf $(PROFILE_DIR)

# Remove the stamp files of all pipeline targets so every step is considered out of date
# again. Neither the build trees nor the profile data are touched.
clean:
	rm -f \
		stage0 \
		stage1 \
		copy_originals \
		bolt_instrument \
		merge_data \
		bolt

# Put the .original copy of every library back in place of the (instrumented or optimized) file.
# All files are attempted even if one copy fails, but any failure makes the recipe fail
# instead of only the last iteration's result being reported.
restore_originals: copy_originals
	status=0; \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file="$(STAGE1_BUILD)/usr/lib/$$file"; \
		cp -P "$$abs_file.original" "$$abs_file" || status=1; \
	done; \
	exit $$status

# Remove the .original copy of every library.
# All files are attempted even if one removal fails, but any failure makes the recipe fail
# instead of only the last iteration's result being reported.
delete_originals: copy_originals
	status=0; \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file="$(STAGE1_BUILD)/usr/lib/$$file"; \
		rm "$$abs_file.original" || status=1; \
	done; \
	exit $$status
