#!/usr/bin/make -f
# Compiler selection: the LLVM 22 bin directory is put first in PATH and the
# plain clang/clang++ names are exported as CC/CXX.
export CXX=clang++
export CC=clang
#export HIPCXX=clang++
#export HIPFLAGS=-std=c++23 -Wno-c++20-extensions
export PATH:=/usr/lib/llvm-22/bin:$(PATH)

# Defines DEB_HOST_GNU_TYPE (used below for the obj-<triplet> build directory)
# so that this file also works when it is run directly instead of through
# dpkg-buildpackage.  Values already set in the environment are preserved.
include /usr/share/dpkg/architecture.mk

# Name of the Python distribution handled by pybuild.
export PYBUILD_NAME=ck4inductor
# dpkg-buildflags tuning: all hardening features, extra warnings, linker -O1.
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
export DEB_CFLAGS_MAINT_APPEND  = -Wall -pedantic
export DEB_LDFLAGS_MAINT_APPEND = -Wl,-O1
# Set FULL_TEST=1 to also configure, build and install the per-architecture
# tests and examples (see the FULL_TEST sections in the rules below).
FULL_TEST ?= 0

# Keep options that are incompatible with device code from affecting it:
# each flag below is prefixed with -Xarch_host in CXXFLAGS.
# The := assignments rewrite only the value CXXFLAGS already has when this
# file is parsed.
# NOTE(review): nothing visible in this file populates CXXFLAGS before this
# point - confirm it arrives via the environment, otherwise these two lines
# are no-ops.
CXXFLAGS := $(subst -fstack-protector-strong,-Xarch_host -fstack-protector-strong,$(CXXFLAGS))
CXXFLAGS := $(subst -fcf-protection,-Xarch_host -fcf-protection,$(CXXFLAGS))

# When CCACHE_DIR has a non-empty value, enable ccache: turn on the project's
# ENABLE_CCACHE_GEMM* options and use ccache as the C/C++ compiler launcher.
# Without CCACHE_DIR, CMAKE_CACHE_FLAGS stays undefined and expands to nothing.
# NOTE(review): CCACHE is not referenced in the visible part of this file.
ifdef CCACHE_DIR
CMAKE_CACHE_FLAGS = \
	-DENABLE_CCACHE_GEMM=ON -DENABLE_CCACHE_GEMM_PRESHUFFLE=ON \
	-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
CCACHE=ccache
endif

# Extra compiler flag sets for Composable Kernel.
# NOTE(review): neither variable is referenced in the visible part of this
# file - confirm whether they are still needed.
CKCXXDEVFLAGS=-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=512
CKSTDFLAGS=-std=c++23 -Wno-c++20-extensions

# Could also add -Wno-#pragma-messages

# The default _FORTIFY_SOURCE=2 appears to make the build hang in an infinite loop:
# https://clang.debian.net/status.php?version=13.0.0&key=BUILD_TIMEOUT
# Disabling _FORTIFY_SOURCE lets the build complete with a reasonable amount of memory.
# Alternative -DCMAKE_C_FLAGS / -DCMAKE_CXX_FLAGS settings:
#   FORTIFLAG0: undefine _FORTIFY_SOURCE, then define it as 0
#   FORTIFLAG1: undefine _FORTIFY_SOURCE, then define it as 1
#   FORTIFLAGX: only undefine _FORTIFY_SOURCE
# FORTIFLAG selects the variant that is appended to the cmake flag lists.
FORTIFLAG0=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0"
FORTIFLAG1=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=1" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=1"
FORTIFLAGX=-DCMAKE_C_FLAGS="-U_FORTIFY_SOURCE" \
	-DCMAKE_CXX_FLAGS="-U_FORTIFY_SOURCE"
FORTIFLAG=$(FORTIFLAGX)

#-std=c++23
# cmake options for a development build.
# NOTE(review): DEV_FLAGS is not referenced in the visible part of this file.
DEV_FLAGS=-DUSE_BITINT_EXTENSION_INT4=OFF -DBUILD_DEV=ON

# For arches see https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
# The double quotes are part of the make value: they reach the shell when the
# variable is expanded in a recipe, where they keep ';' from being treated as
# a command separator.
ROCM_TARGET_ARCH_FIXED="gfx11-generic;gfx12-generic"
# Recursive (=) assignment: rocm-target-arch runs each time this variable is
# expanded, not when the file is parsed.
GPU_ARCHS_LIST=$(shell rocm-target-arch)
HIP_PLATFORM="amd"

# cmake arguments for the main library build, one option per line.
# The variable stays recursively expanded, so the referenced variables are
# resolved when CMAKE_FLAGS is used.
CMAKE_FLAGS  = -DCMAKE_BUILD_TYPE=Release
CMAKE_FLAGS += -DHIP_PLATFORM=$(HIP_PLATFORM)
CMAKE_FLAGS += -DMIOPEN_REQ_LIBS_ONLY=ON
CMAKE_FLAGS += -DGPU_ARCHS=$(ROCM_TARGET_ARCH_FIXED)
CMAKE_FLAGS += -DCMAKE_CXX_COMPILER=clang++
CMAKE_FLAGS += $(CMAKE_CACHE_FLAGS)
CMAKE_FLAGS += $(FORTIFLAG)

# cmake arguments for the per-architecture test/example builds (FULL_TEST=1),
# one option per line.  The variable stays recursively expanded.
CMAKE_TST_FLAGS  = -DCMAKE_BUILD_TYPE=Release
CMAKE_TST_FLAGS += -DHIP_PLATFORM=$(HIP_PLATFORM)
CMAKE_TST_FLAGS += -DBUILD_TESTING=ON
CMAKE_TST_FLAGS += -DSKIP_BROKEN_EXAMPLE=ON
CMAKE_TST_FLAGS += -DCMAKE_CXX_COMPILER=clang++
CMAKE_TST_FLAGS += -DSKIP_LONG_BUILD=ON
CMAKE_TST_FLAGS += $(CMAKE_CACHE_FLAGS)
CMAKE_TST_FLAGS += $(FORTIFLAG)


#	-DCMAKE_HIP_STANDARD=23 \

# The next make not building tests

# From CMakeLists.txt
# In order to build just the CK library (without tests and examples) for all supported GPU targets
#	-D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"

# In order to build CK along with all tests and examples it should be OK
# to set GPU_TARGETS to just 1 or 2 similar architectures.
#
# From TheRock builds
#	-DGPU_TARGETS="gfx1100;gfx1101;gfx1102"

# When using GPU_TARGETS the build will only work if the architectures are similar
# 	-DGPU_TARGETS="gfx908;gfx90a"
#	-DGPU_TARGETS="gfx1100;gfx1101;gfx1102"

# None of the following worked: GPU_TARGETS cannot handle that many targets,
# and GPU_ARCHS disables the tests.
#	-DGPU_TARGETS="$(shell rocm-target-arch --sep ';')"
#	-DGPU_ARCHS="$(shell rocm-target-arch --sep ';')"

# From The Rock
# -GNinja
# -DCMAKE_BUILD_TYPE=Release
# -DHIP_PLATFORM=amd
# -DBUILD_TESTING=ON
# -DMIOPEN_REQ_LIBS_ONLY=ON
# -DGPU_TARGETS="gfx1100;gfx1101;gfx1102"

# LP_ENV flag: 1 for Launchpad environment (uses fixed thread count)
#              0 for other environments (calculates thread count based on available CPUs)
# Assigned with ?= (like FULL_TEST) so it can also be supplied from the
# environment; the default is unchanged.
LP_ENV ?= 0

# Number of available CPUs.  Falls back to 1 when nproc prints nothing, so the
# arithmetic below never sees an empty operand.
NPROC := $(or $(shell nproc 2>/dev/null),1)

ifeq ($(strip $(LP_ENV)),1)
# Launchpad environment: use fixed thread count
THREADS_BASE := 1
THREADS_DOUBLE := 1
else
# Other environments: one job per 8 CPUs (integer division), but at least 1.
THREADS_BASE := $(shell n=$$(( $(NPROC) / 8 )); [ "$$n" -ge 1 ] || n=1; echo "$$n")
# The per-architecture test/example builds use twice that many jobs.
THREADS_DOUBLE := $(shell echo $$(( $(THREADS_BASE) * 2 )))
endif

# Catch-all rule: every target without an explicit rule in this file is
# handed to the dh sequencer.
%:
	dh $@

# Configure the main library build (cmake + ninja) in obj-$(DEB_HOST_GNU_TYPE).
# Before configuring, a symlink to /usr/src/googletest is created at
# _deps/gtest-src inside the build tree unless it already exists.
override_dh_auto_configure-arch:
	mkdir -p obj-$(DEB_HOST_GNU_TYPE)/_deps
	[ -L obj-$(DEB_HOST_GNU_TYPE)/_deps/gtest-src ] || \
		ln -s /usr/src/googletest obj-$(DEB_HOST_GNU_TYPE)/_deps/gtest-src
	dh_auto_configure -O--buildsystem=cmake+ninja -- $(CMAKE_FLAGS)
# The full test build currently requires a huge amount of memory/CPU.
# The composable kernel upstream build system is a work in progress.
# Keep this here as it was tested and may be useful in the future.
# One separate build tree (build/<arch>) is configured per GPU architecture.
# "set -e" makes a failure for any architecture abort the recipe; without it
# only the exit status of the last loop iteration would be seen by make.
ifeq ($(FULL_TEST),1)
	set -e; \
	for arch in $(GPU_ARCHS_LIST); do \
		echo "Configure test for $$arch" ; \
		mkdir -p build/$$arch/_deps ; \
		[ -L build/$$arch/_deps/gtest-src ] || \
			ln -s /usr/src/googletest build/$$arch/_deps/gtest-src ; \
		dh_auto_configure --buildsystem=cmake+ninja --sourcedir=$(CURDIR) \
		--builddir=build/$$arch -- $(CMAKE_TST_FLAGS) -DGPU_TARGETS="$$arch" ; \
	done
endif

# Architecture-independent configure step: uses the pybuild build system only.
override_dh_auto_configure-indep:
	dh_auto_configure -O--buildsystem=pybuild

# Build step, run as a single shell invocation (one continued recipe line):
#   1. Start a background loop that prints "#" once a minute and remember its
#      PID; an EXIT trap kills that loop when the shell ends, whether the
#      build succeeded or failed.
#      NOTE(review): presumably the heartbeat keeps builders from killing a
#      long, silent build for inactivity - confirm.
#   2. "set -e" so that any failing command below aborts the recipe.
#   3. Build the Python package (pybuild), then the main library
#      (cmake + ninja) with THREADS_BASE parallel jobs.
#   4. With FULL_TEST=1, build the "examples" and "tests" targets in each
#      build/<arch> tree with THREADS_DOUBLE parallel jobs.
override_dh_auto_build:
	@echo "=== Starting build with heartbeat monitoring ==="
	@(while true; do echo "#"; sleep 1m; done) & HB_PID=$$!; \
	trap "kill $$HB_PID 2>/dev/null || true; echo ''; echo '=== Heartbeat stopped ==='" EXIT; \
	set -e; \
	dh_auto_build -O--buildsystem=pybuild; \
	dh_auto_build -O--buildsystem=cmake+ninja -- -j$(THREADS_BASE); \
	if [ "$(FULL_TEST)" = "1" ]; then \
		for arch in $(GPU_ARCHS_LIST); do \
			echo "===== Building for $$arch ====="; \
			dh_auto_build --buildsystem=cmake+ninja --builddir=build/$$arch -- -j$(THREADS_DOUBLE) examples tests || exit 1; \
		done; \
	fi; \
	echo "=== Build completed successfully ==="

# The object files add up to roughly 28GB, which makes Launchpad builders
# fail with 'No space left on device'; delete them before the install step.
execute_before_dh_auto_install-arch:
	find $(CURDIR) -name '*.o' -delete

# Build the HTML documentation after the arch-independent build, unless
# "nodoc" is listed in DEB_BUILD_OPTIONS.
# Both proxy variables are pointed at 127.0.0.1:9 for this target.
# NOTE(review): presumably so that any network access attempted by the
# documentation tools fails immediately - confirm.
execute_after_dh_auto_build-indep: export http_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep: export https_proxy=127.0.0.1:9
execute_after_dh_auto_build-indep:
ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS)))
# Adjust three Doxyfile settings in one pass so the generated pages do not
# contain the build path or a timestamp.
	perl -pi \
		-e 's/^FULL_PATH_NAMES.*/FULL_PATH_NAMES = NO/;' \
		-e 's|^STRIP_FROM_PATH.*|STRIP_FROM_PATH = $(CURDIR)/|;' \
		-e 's/^TIMESTAMP.*/TIMESTAMP = NO/;' \
		docs/doxygen/Doxyfile
	export DOXYGEN_STRIP_FROM_PATH=$(CURDIR)/; rocm-docs-build
# Remove the jquery.js copy from the doxygen output.
	rm -f build/html/doxygen/html/jquery.js
# Drop every line that references cdn.jsdelivr.net from the two named pages.
	find build/html \
		\( -name Composable-Kernel-math.html -o -name Composable-Kernel-Glossary.html \) \
		-exec perl -ni -e 'print unless /cdn\.jsdelivr\.net/' {} \;
# Strip any remaining occurrence of the build directory from all HTML files.
	find build/html -type f -name '*.html' -exec sed -i 's|$(CURDIR)/||g' {} +
endif

# Install step.
# With FULL_TEST=1, each per-architecture tree build/<arch> is first staged
# into build/cktests/<arch>:
#   - install the "test" and "example" subdirectories with DESTDIR set,
#   - copy their CTestTestfile.cmake files alongside (cpio pass-through),
#   - rewrite the build-tree bin path in those files to the installed
#     location under /usr/libexec/rocm/libcomposable-kernel-tests/<arch>,
#     then strip any remaining build-directory prefix.
# "set -e" aborts on the first failing command instead of silently carrying
# on, and $(MAKE) replaces the literal "make" for the sub-make calls.
# NOTE(review): the per-arch trees are configured with cmake+ninja in
# override_dh_auto_configure-arch; confirm that a make-based "install" is
# valid there (a ninja/cmake-based install may be what is needed).
override_dh_auto_install:
ifeq ($(FULL_TEST),1)
	set -e; \
	for arch in $(GPU_ARCHS_LIST); do \
		$(MAKE) -C $(CURDIR)/build/$$arch/test install DESTDIR=$(CURDIR)/build/cktests/$$arch ; \
		$(MAKE) -C $(CURDIR)/build/$$arch/example install DESTDIR=$(CURDIR)/build/cktests/$$arch ; \
		cd $(CURDIR)/build/$$arch ; \
		find test example -name CTestTestfile.cmake | cpio -pdumv $(CURDIR)/build/cktests/$$arch ; \
		find $(CURDIR)/build/cktests/$$arch -type f -name 'CTestTestfile.cmake' -exec sed -i \
			-e "s|$(CURDIR)/build/$$arch/bin|/usr/libexec/rocm/libcomposable-kernel-tests/$$arch/usr/bin|g" \
			-e "s|$(CURDIR)||g" {} + ; \
	done
endif
# Main library from the obj-<triplet> tree, then the Python package straight
# into its binary package directory.
	dh_auto_install -O--buildsystem=cmake+ninja -O--builddirectory=$(CURDIR)/obj-$(DEB_HOST_GNU_TYPE)
	dh_auto_install --destdir=debian/python3-ck4inductor/ -O--buildsystem=pybuild
# Fix the README location in the temporary install directory: move it out of
# the header tree into the -dev package's doc directory.
	if [ -f debian/tmp/usr/include/ck/README.md ]; then \
		mkdir -p debian/libcomposable-kernel-dev/usr/share/doc/libcomposable-kernel-dev/; \
		mv debian/tmp/usr/include/ck/README.md \
			debian/libcomposable-kernel-dev/usr/share/doc/libcomposable-kernel-dev/README.ck-headers.md; \
	fi
# Remove every installed file named LICENSE.
	find debian -type f -name LICENSE -exec rm -f {} \;

# Remove generated files that dh_clean does not know about (Python bytecode
# caches, the build/ tree, generated documentation, pybuild state), then run
# the regular dh_clean.
override_dh_clean:
	rm -rf \
		.pybuild/ \
		build/ \
		docs/_doxygen/ \
		docs/doxygen/html/ \
		docs/doxygen/xml/ \
		docs/sphinx/_toc.yml \
		example/ck_tile/01_fmha/codegen/__pycache__/ \
		example/ck_tile/01_fmha/codegen/ops/__pycache__/ \
		rocm_composable_kernel.egg-info/ \
		tile_engine/ops/gemm/__pycache__/
	dh_clean

# Run dh_strip with automatic -dbgsym package generation turned off.
override_dh_strip:
	dh_strip --no-automatic-dbgsym

# Run the normal dh_fixperms, then clear the executable bits on every
# CTestTestfile.cmake found under debian/.
override_dh_fixperms:
	dh_fixperms
	find debian -name 'CTestTestfile.cmake' -exec chmod -x {} \;

# Pass the fixed GPU architecture list to dpkg-gencontrol as the substitution
# variable rocm:GPU-Architecture, with the ';' separators replaced by spaces.
# The quotes stored in ROCM_TARGET_ARCH_FIXED keep the resulting
# space-separated list together as a single shell argument.
override_dh_gencontrol:
	dh_gencontrol -- -Vrocm:GPU-Architecture=$(subst ;, ,$(ROCM_TARGET_ARCH_FIXED))
