# Targets that never produce a file of their own name and must always run.
# The stage0/stage1/stage2/copy_originals/bolt_instrument/merge_data/bolt
# targets are deliberately NOT listed here: their recipes `touch $@` to leave a
# stamp file that records completion.
.PHONY: clean clean_profiles restore_originals delete_originals finish_stage2 top

# See the makefiles in contrib/bolt and contrib/pgo-lto for more information.

# Optimization flags passed to llvm-bolt by the `bolt` target.
# Settings taken from https://github.com/rust-lang/rust/blob/master/src/tools/opt-dist/src/bolt.rs
BOLT_ARGS :=
# Reorder basic blocks within functions
BOLT_ARGS += -reorder-blocks=ext-tsp
# Reorder functions within the binary
BOLT_ARGS += -reorder-functions=cdsort
# Split function code into hot and cold regions
BOLT_ARGS += -split-functions
# Split as many basic blocks as possible
BOLT_ARGS += -split-all-cold
# Move jump tables to a separate section
BOLT_ARGS += -jump-tables=move
# Use regular size pages for code alignment
BOLT_ARGS += -no-huge-pages
# Fold functions with identical code
BOLT_ARGS += -icf=1
# Split using best available strategy (three-way splitting, Cache-Directed Sort)
# Disabled for libjulia-internal till https://github.com/llvm/llvm-project/issues/89508 is fixed;
# the `bolt` target adds this flag for every other library instead.
# BOLT_ARGS += -split-strategy=cdsplit
# Update DWARF debug info in the final binary
BOLT_ARGS += -update-debug-sections
# Print optimization statistics
BOLT_ARGS += -dyno-stats
# BOLT doesn't fully support computed gotos, https://github.com/llvm/llvm-project/issues/89117
# Use an escaped regex, as the name BOLT recognises is often a bit different, e.g. apply_cl/1(*2)
# This doesn't actually seem to do anything; the actual mitigation is to not pass
# --use-old-text for libjulia-internal, which is handled in the `bolt` target.
BOLT_ARGS += -skip-funcs=.\*apply_cl.\*

# Extra make variables handed to the stage2 Julia build so that the libraries
# it produces can later be processed by BOLT.
# -fno-reorder-blocks-and-partition is needed on gcc >= 8.
BOLT_FLAGS := "BOLT_CFLAGS_GCC+=-fno-reorder-blocks-and-partition"
# Ask the linker to keep relocations in the linked output.
BOLT_FLAGS += "BOLT_LDFLAGS=-Wl,--emit-relocs"

# Root of the Julia source tree, relative to this directory.
JULIA_ROOT := $(CURDIR)/../..

# One build directory per stage, all created next to this makefile.
STAGE0_BUILD := $(CURDIR)/toolchain
STAGE1_BUILD := $(CURDIR)/pgo-instrumented.build
STAGE2_BUILD := $(CURDIR)/optimized.build

# Where the stage0 toolchain installs its executables. Both values keep their
# trailing slash because tool names are appended to them directly.
STAGE0_BINARIES := $(STAGE0_BUILD)/usr/bin/
STAGE0_TOOLS := $(STAGE0_BUILD)/usr/tools/

# Profile locations: raw/merged PGO data and BOLT instrumentation output.
PGO_PROFILE_DIR := $(CURDIR)/profiles
PGO_PROFILE_FILE := $(PGO_PROFILE_DIR)/merged.prof
# Note: this list is captured once, when the makefile is parsed.
PGO_PROFRAW_FILES := $(wildcard $(PGO_PROFILE_DIR)/*.profraw)
BOLT_PROFILE_DIR := $(CURDIR)/profiles-bolt

# Tools taken from the stage0 toolchain.
LLVM_BOLT := $(STAGE0_BINARIES)llvm-bolt
LLVM_MERGEFDATA := $(STAGE0_BINARIES)merge-fdata
LLVM_CXXFILT := $(STAGE0_TOOLS)llvm-cxxfilt
LLVM_PROFDATA := $(STAGE0_TOOLS)llvm-profdata
LLVM_OBJCOPY := $(STAGE0_TOOLS)llvm-objcopy

# Libraries that BOLT post-processes, named by their symlinks in usr/lib.
# If you add new files to optimize, you need to add BOLT_LDFLAGS and BOLT_CFLAGS to the build of your new file.
SYMLINKS_TO_OPTIMIZE := libLLVM.so libjulia-internal.so libjulia-codegen.so
# Resolve each symlink (in the stage1 build tree) to the file name it points at.
# This runs once, while the makefile is being parsed.
FILES_TO_OPTIMIZE := $(strip $(foreach lib,$(SYMLINKS_TO_OPTIMIZE),$(shell readlink $(STAGE1_BUILD)/usr/lib/$(lib))))

# Hint echoed at the end of the `bolt_instrument` target. The value includes
# its own single quotes so that the shell prints the backticks literally; each
# `$\` joins the next line without inserting extra whitespace.
AFTER_INSTRUMENT_MESSAGE:='Run `make finish_stage2` to finish off the build. $\
	You can now optionally collect more profiling data by running Julia with an appropriate workload, $\
	if you wish, run `make clean_profiles` before doing so to remove any profiling data generated by `make finish_stage2`. $\
	You should end up with some data in $(BOLT_PROFILE_DIR). Afterwards run `make merge_data && make bolt`.'

# When building a single libLLVM.so we need to increase -vp-counters-per-site
# significantly
# (passed to clang via -mllvm in the stage1 PGO_CFLAGS/PGO_CXXFLAGS).
COUNTERS_PER_SITE:=6
# Note: profile counters are not atomic by default, https://discourse.llvm.org/t/profile-guided-optimization-pgo-related-questions-and-suggestions/75232/5

# Hint echoed at the end of the `stage1` target; quoted the same way as
# AFTER_INSTRUMENT_MESSAGE.
AFTER_STAGE1_MESSAGE:='You can now optionally collect more profiling data for use in PGO by running Julia $\
	with an appropriate workload. If you wish, run `make clean_profiles` before doing so to remove any profiling data $\
	generated by building Julia. You should end up with about 15MB of data in $(PGO_PROFILE_DIR). $\
	Note that running extensive scripts may result in counter overflows, which can be detected by running $\
	`make top`. Afterwards run `make stage2`.'

# Variable overrides passed on the command line of the stage1/stage2 sub-makes
# so that they build with the stage0 toolchain.
# This variable is intentionally recursive (`=`, and `+=` keeps that flavour):
# PGO_CFLAGS, PGO_CXXFLAGS and PGO_LDFLAGS are target-specific, so they may
# only be expanded when a recipe uses $(TOOLCHAIN_FLAGS).
TOOLCHAIN_FLAGS = "CC=$(STAGE0_TOOLS)clang"
TOOLCHAIN_FLAGS += "CXX=$(STAGE0_TOOLS)clang++"
TOOLCHAIN_FLAGS += "LD=$(STAGE0_TOOLS)ld.lld"
TOOLCHAIN_FLAGS += "AR=$(STAGE0_TOOLS)llvm-ar"
TOOLCHAIN_FLAGS += "RANLIB=$(STAGE0_TOOLS)llvm-ranlib"
TOOLCHAIN_FLAGS += "CFLAGS+=$(PGO_CFLAGS)"
TOOLCHAIN_FLAGS += "CXXFLAGS+=$(PGO_CXXFLAGS)"
TOOLCHAIN_FLAGS += "LDFLAGS+=-fuse-ld=lld $(PGO_LDFLAGS)"

# Create a stage build directory by invoking the `configure` goal of the
# makefile in $(JULIA_ROOT), with O set to the directory being made ($@).
# The same recipe serves all three stage directories.
$(STAGE0_BUILD) $(STAGE1_BUILD) $(STAGE2_BUILD):
	$(MAKE) -C $(JULIA_ROOT) O=$@ configure

# stage0: install clang, LLVM, lld, the LLVM tools and BOLT from the deps of the
# $(STAGE0_BUILD) tree (with USE_BINARYBUILDER_LLVM=1 exported to the sub-make),
# then leave a `stage0` stamp file.
#
# After installation, turn [cd]tors into init/fini_array sections in
# libclang_rt, since lld doesn't do that, and otherwise the profile constructor
# is not executed.
#
# $(STAGE0_BUILD) is an order-only prerequisite, so `$<` is empty in this rule;
# the directory is therefore passed to `find` explicitly, which keeps the
# search confined to the toolchain tree instead of the current directory.
stage0: export USE_BINARYBUILDER_LLVM=1
stage0: | $(STAGE0_BUILD)
	$(MAKE) -C $(STAGE0_BUILD)/deps install-clang install-llvm install-lld install-llvm-tools install-BOLT && \
	find $(STAGE0_BUILD) -name 'libclang_rt.profile-*.a' -exec $(LLVM_OBJCOPY) --rename-section .ctors=.init_array --rename-section .dtors=.fini_array {} + && \
	touch $@

# Configuring the stage1 build directory additionally requires stage0.
$(STAGE1_BUILD): stage0
# stage1: build Julia in $(STAGE1_BUILD) with the stage0 toolchain and
# -fprofile-generate instrumentation pointed at $(PGO_PROFILE_DIR), leave a
# `stage1` stamp file, then print AFTER_STAGE1_MESSAGE.
# The target-specific PGO_* values below reach the sub-make through
# $(TOOLCHAIN_FLAGS); -vp-counters-per-site is forwarded to LLVM via -mllvm.
stage1: PGO_CFLAGS:=-fprofile-generate=$(PGO_PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
stage1: PGO_CXXFLAGS:=-fprofile-generate=$(PGO_PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
stage1: PGO_LDFLAGS:=-flto=thin -fprofile-generate=$(PGO_PROFILE_DIR)
stage1: export USE_BINARYBUILDER_LLVM=0
stage1: | $(STAGE1_BUILD)
	$(MAKE) -C $(STAGE1_BUILD) $(TOOLCHAIN_FLAGS) && touch $@
	@echo $(AFTER_STAGE1_MESSAGE)

# stage2: rebuild the listed Julia goals in $(STAGE2_BUILD) using the merged
# PGO profile, ThinLTO and the BOLT-specific flags, then leave a `stage2`
# stamp file. Only the goals named below are built here.
stage2: export USE_BINARYBUILDER_LLVM=0
stage2: PGO_CFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
stage2: PGO_CXXFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
stage2: PGO_LDFLAGS:=-flto=thin -fprofile-use=$(PGO_PROFILE_FILE) -Wl,--icf=safe
stage2: $(PGO_PROFILE_FILE) | $(STAGE2_BUILD)
	$(MAKE) -C $(STAGE2_BUILD) $(TOOLCHAIN_FLAGS) $(BOLT_FLAGS) \
		julia-src-release \
		julia-symlink \
		julia-libccalltest \
		julia-libccalllazyfoo \
		julia-libccalllazybar \
		julia-libllvmcalltest \
	&& touch $@

# Keep a pristine `<file>.original` copy of every library that BOLT will
# rewrite, then leave a `copy_originals` stamp file.
# `|| exit 1` aborts on the first failed copy; without it only the status of
# the last iteration would decide whether the stamp gets written.
copy_originals: stage2
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
		cp "$$abs_file" "$$abs_file.original" || exit 1; \
	done && \
	touch $@

# Replace each library with a BOLT-instrumented build of its `.original` copy.
# The instrumented libraries write their profiles to
# $(BOLT_PROFILE_DIR)/<file>-prof (with the pid appended).
# I don't think there's any particular reason to have -no-huge-pages here, perhaps slightly more accurate profile data
# as the final build uses -no-huge-pages
# We reset the mtime of the files to prevent make from rebuilding targets depending on them.
# Every step is chained with `&&` and the loop exits on the first failure, so a
# failing llvm-bolt run can no longer be masked by the commands after it and
# the `bolt_instrument` stamp is only written when all libraries succeeded.
bolt_instrument: copy_originals
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
		old_time=$$(stat -c %Y "$$abs_file") && \
		mkdir -p "$$(dirname "$(BOLT_PROFILE_DIR)/$$file-prof")" && \
		$(LLVM_BOLT) "$$abs_file.original" -o "$$abs_file" --instrument --instrumentation-file-append-pid --instrumentation-file="$(BOLT_PROFILE_DIR)/$$file-prof" -no-huge-pages && \
		touch -d "@$$old_time" "$$abs_file" && \
		printf "\n" || exit 1; \
	done && \
	touch $@
	@echo $(AFTER_INSTRUMENT_MESSAGE)

# finish_stage2: after stage2, run the default goal of the $(STAGE2_BUILD) tree
# with the same profile-use/ThinLTO flags that stage2 uses. No stamp file is
# written, so the sub-make is invoked every time this target is requested.
# NOTE(review): unlike stage1/stage2 this target neither exports
# USE_BINARYBUILDER_LLVM=0 nor passes $(BOLT_FLAGS) -- confirm that is intended.
finish_stage2: PGO_CFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
finish_stage2: PGO_CXXFLAGS:=-fprofile-use=$(PGO_PROFILE_FILE)
finish_stage2: PGO_LDFLAGS:=-flto=thin -fprofile-use=$(PGO_PROFILE_FILE) -Wl,--icf=safe
finish_stage2: stage2
	$(MAKE) -C $(STAGE2_BUILD) $(TOOLCHAIN_FLAGS)

# Merge the per-process BOLT profiles (<file>-prof.<suffix>.fdata) of each
# library into <file>-prof.merged.fdata, then leave a `merge_data` stamp file.
#
# The merged file's own name matches the input glob, so a stale copy from a
# previous run is removed first and the new result is written to a temporary
# name (which the glob does not match) before being moved into place. This way
# the output is never fed back in as an input and never left half-written.
# The loop exits on the first failure so the stamp is only written on success.
merge_data: bolt_instrument
	for file in $(FILES_TO_OPTIMIZE); do \
		profiles=$(BOLT_PROFILE_DIR)/$$file-prof.*.fdata; \
		merged="$(BOLT_PROFILE_DIR)/$$file-prof.merged.fdata"; \
		rm -f "$$merged" && \
		$(LLVM_MERGEFDATA) $$profiles > "$$merged.tmp" && \
		mv -f "$$merged.tmp" "$$merged" || exit 1; \
	done && \
	touch $@

# Produce the final BOLT-optimized libraries from their `.original` copies and
# the merged profiles, then leave a `bolt` stamp file.
#
# The --use-old-text saves about 16 MiB of libLLVM.so size.
# However, the rust folk found it succeeds very non-deterministically for them.
# It tries to reuse old text segments to reduce binary size
# BOLT doesn't fully support computed gotos https://github.com/llvm/llvm-project/issues/89117, so we cannot use --use-old-text on libjulia-internal
# That flag saves less than 1 MiB for libjulia-internal so oh well.
# -split-strategy=cdsplit is likewise added for every library except libjulia-internal.
# We reset the mtime of the files to prevent make from rebuilding targets depending on them.
#
# The name libjulia-internal.so resolves to is looked up once, before the loop.
# If it cannot be resolved, the extra flags are not added to any library.
# Steps are chained with `&&` and the loop exits on the first failure, so a
# failing llvm-bolt run is not masked by the `touch -d` that follows it.
bolt: merge_data
	internal=$$(readlink $(STAGE2_BUILD)/usr/lib/libjulia-internal.so); \
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
		extra_args=; \
		if [ -n "$$internal" ] && [ "$$file" != "$$internal" ]; then \
			extra_args="--use-old-text -split-strategy=cdsplit"; \
		fi; \
		old_time=$$(stat -c %Y "$$abs_file") && \
		$(LLVM_BOLT) "$$abs_file.original" -data "$(BOLT_PROFILE_DIR)/$$file-prof.merged.fdata" -o "$$abs_file" $(BOLT_ARGS) $$extra_args && \
		touch -d "@$$old_time" "$$abs_file" || exit 1; \
	done && \
	touch $@

# Delete all collected profile data, both PGO and BOLT.
clean_profiles:
	rm -rf \
		$(PGO_PROFILE_DIR) \
		$(BOLT_PROFILE_DIR)

# Forget which steps have completed by removing the stamp files and the merged
# PGO profile. Build directories and raw profiles are left in place.
clean:
	rm -f \
		stage0 stage1 stage2 \
		$(PGO_PROFILE_FILE) \
		bolt copy_originals merge_data bolt_instrument

# Put the saved `.original` copy of every library back in place.
# The loop exits on the first failed copy instead of reporting only the status
# of the last iteration.
restore_originals: copy_originals
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
		cp -P "$$abs_file.original" "$$abs_file" || exit 1; \
	done

# Remove the saved `.original` copy of every library.
# The loop exits on the first failed removal instead of reporting only the
# status of the last iteration.
delete_originals: copy_originals
	for file in $(FILES_TO_OPTIMIZE); do \
		abs_file=$(STAGE2_BUILD)/usr/lib/$$file; \
		rm "$$abs_file.original" || exit 1; \
	done

# Merge the raw PGO profiles into $(PGO_PROFILE_FILE).
# $(PGO_PROFRAW_FILES) is computed while the makefile is parsed, so it is kept
# only as a prerequisite list (to trigger re-merging when a known .profraw file
# changes). The recipe lets the shell glob the directory when it runs, so
# .profraw files written after parsing -- for example by a stage1 build in the
# same make invocation -- are merged as well.
$(PGO_PROFILE_FILE): stage1 $(PGO_PROFRAW_FILES)
	$(LLVM_PROFDATA) merge -output=$@ "$(PGO_PROFILE_DIR)"/*.profraw

# Show the top 50 functions of the merged PGO profile, with symbol names
# piped through llvm-cxxfilt.
top: $(PGO_PROFILE_FILE)
	$(LLVM_PROFDATA) show --topn=50 $(PGO_PROFILE_FILE) | $(LLVM_CXXFILT)
