diff --git a/build.rs b/build.rs
index 53c7b15d00df6de31278953c3196c5404067540f..a0f1d4d5b9ee1042bfee77d5d42da447b69d007d 100644
--- a/build.rs
+++ b/build.rs
@@ -91,6 +91,26 @@ fn fill_from_location(f: &mut fs::File, loc: &Path) -> Result<(), Error> {
     Ok(())
 }
 
+/// Assembles the AP trampoline with nasm into `<out_dir>/trampoline`; panics if
+/// nasm cannot be run or exits unsuccessfully. Does nothing unless the *target*
+/// is x86_64: a build script is compiled for the host, so the target arch is
+/// read from `CARGO_CFG_TARGET_ARCH` rather than `#[cfg(target_arch)]`.
+fn asm(out_dir: &str) {
+    if env::var("CARGO_CFG_TARGET_ARCH").ok() != Some(String::from("x86_64")) {
+        return;
+    }
+    println!("cargo:rerun-if-changed=src/asm/x86_64/trampoline.asm");
+
+    let status = std::process::Command::new("nasm")
+        .args(&["-f", "bin", "-o"]).arg(Path::new(out_dir).join("trampoline"))
+        .arg("src/asm/x86_64/trampoline.asm")
+        .status()
+        .expect("failed to run nasm");
+    if !status.success() {
+        panic!("nasm failed with exit status {}", status);
+    }
+}
+
 fn main() {
     println!("cargo:rustc-env=TARGET={}", env::var("TARGET").unwrap());
     println!("cargo:rerun-if-env-changed=INITFS_FOLDER");
@@ -100,6 +120,8 @@ fn main() {
     let mut f = fs::File::create(&dest_path).unwrap();
     let src = env::var("INITFS_FOLDER");
 
+    asm(&out_dir);
+
     // Write header
     f.write_all(
         b"
diff --git a/src/acpi/madt.rs b/src/acpi/madt.rs
index 61396b7bc07c29b0fa287c733ae045f1fd47b45a..cae2189e9aedfa406fa921afe80c093c735a5838 100644
--- a/src/acpi/madt.rs
+++ b/src/acpi/madt.rs
@@ -5,7 +5,7 @@ use crate::paging::{ActivePageTable, Page, PhysicalAddress, VirtualAddress};
 use crate::paging::entry::EntryFlags;
 
 use super::sdt::Sdt;
-use super::{AP_STARTUP, TRAMPOLINE, find_sdt, load_table, get_sdt_signature};
+use super::{find_sdt, load_table, get_sdt_signature};
 
 use core::intrinsics::{atomic_load, atomic_store};
 use core::sync::atomic::Ordering;
@@ -22,6 +22,9 @@ pub struct Madt {
     pub flags: u32
 }
 
+const TRAMPOLINE: usize = 0x8000;
+static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
+
 pub static mut MADT: Option<Madt> = None;
 pub const FLAG_PCAT: u32 = 1;
 
@@ -52,13 +55,19 @@ impl Madt {
             }
 
             if cfg!(feature = "multi_core") {
+                // Map trampoline
                 let trampoline_frame = Frame::containing_address(PhysicalAddress::new(TRAMPOLINE));
                 let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
-
-                // Map trampoline
                 let result = active_table.map_to(trampoline_page, trampoline_frame, EntryFlags::PRESENT | EntryFlags::WRITABLE);
                 result.flush(active_table);
 
+                // Write trampoline, make sure TRAMPOLINE page is free for use
+                for i in 0..TRAMPOLINE_DATA.len() {
+                    unsafe {
+                        atomic_store((TRAMPOLINE as *mut u8).add(i), TRAMPOLINE_DATA[i]);
+                    }
+                }
+
                 for madt_entry in madt.iter() {
                     println!("      {:?}", madt_entry);
                     match madt_entry {
@@ -73,7 +82,7 @@ impl Madt {
                                 let stack_start = allocate_frames(64).expect("no more frames in acpi stack_start").start_address().get() + crate::KERNEL_OFFSET;
                                 let stack_end = stack_start + 64 * 4096;
 
-                                let ap_ready = TRAMPOLINE as *mut u64;
+                                let ap_ready = (TRAMPOLINE + 8) as *mut u64;
                                 let ap_cpu_id = unsafe { ap_ready.offset(1) };
                                 let ap_page_table = unsafe { ap_ready.offset(2) };
                                 let ap_stack_start = unsafe { ap_ready.offset(3) };
@@ -106,7 +115,7 @@ impl Madt {
                                 // Send START IPI
                                 {
                                     //Start at 0x0800:0000 => 0x8000. Hopefully the bootloader code is still there
-                                    let ap_segment = (AP_STARTUP >> 12) & 0xFF;
+                                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
                                     let mut icr = 0x4600 | ap_segment as u64;
 
                                     if local_apic.x2 {
diff --git a/src/acpi/mod.rs b/src/acpi/mod.rs
index 4764aa43791474a5b0060b8cc849e50da0bf5f17..25e64eb87002bb64eaf659a990b7043a94e12953 100644
--- a/src/acpi/mod.rs
+++ b/src/acpi/mod.rs
@@ -39,9 +39,6 @@ pub mod aml;
 mod rxsdt;
 mod rsdp;
 
-const TRAMPOLINE: usize = 0x7E00;
-const AP_STARTUP: usize = TRAMPOLINE + 512;
-
 pub fn get_sdt(sdt_address: usize, active_table: &mut ActivePageTable) -> &'static Sdt {
     {
         let page = Page::containing_address(VirtualAddress::new(sdt_address));
diff --git a/src/arch/x86_64/graphical_debug/display.rs b/src/arch/x86_64/graphical_debug/display.rs
index a88248be6bc7f24d62de6dda659997f87475a531..4f5ecbf8efe2ea707f862d9eb8d4507d2177891f 100644
--- a/src/arch/x86_64/graphical_debug/display.rs
+++ b/src/arch/x86_64/graphical_debug/display.rs
@@ -15,7 +15,7 @@ pub struct Display {
 impl Display {
     pub fn new(width: usize, height: usize, onscreen: usize) -> Display {
         let size = width * height;
-        let offscreen = unsafe { ::ALLOCATOR.alloc(Layout::from_size_align_unchecked(size * 4, 4096)) };
+        let offscreen = unsafe { crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(size * 4, 4096)) };
         unsafe { fast_set64(offscreen as *mut u64, 0, size/2) };
         Display {
             width: width,
@@ -144,6 +144,6 @@ impl Display {
 
 impl Drop for Display {
     fn drop(&mut self) {
-        unsafe { ::ALLOCATOR.dealloc(self.offscreen.as_mut_ptr() as *mut u8, Layout::from_size_align_unchecked(self.offscreen.len() * 4, 4096)) };
+        unsafe { crate::ALLOCATOR.dealloc(self.offscreen.as_mut_ptr() as *mut u8, Layout::from_size_align_unchecked(self.offscreen.len() * 4, 4096)) };
     }
 }
diff --git a/src/arch/x86_64/graphical_debug/mod.rs b/src/arch/x86_64/graphical_debug/mod.rs
index 081f567865c41849357fe0b9920261c6f00adc91..6ea97ed541ef276690e06eed9b6e31a0983cacf3 100644
--- a/src/arch/x86_64/graphical_debug/mod.rs
+++ b/src/arch/x86_64/graphical_debug/mod.rs
@@ -1,9 +1,9 @@
 use spin::Mutex;
 
-use memory::Frame;
-use paging::{ActivePageTable, Page, PhysicalAddress, VirtualAddress};
-use paging::entry::EntryFlags;
-use paging::mapper::MapperFlushAll;
+use crate::memory::Frame;
+use crate::paging::{ActivePageTable, Page, PhysicalAddress, VirtualAddress};
+use crate::paging::entry::EntryFlags;
+use crate::paging::mapper::MapperFlushAll;
 
 pub use self::debug::DebugDisplay;
 use self::display::Display;
@@ -54,13 +54,13 @@ pub fn init(active_table: &mut ActivePageTable) {
     {
         let size = width * height;
 
-        let onscreen = physbaseptr + ::KERNEL_OFFSET;
+        let onscreen = physbaseptr + crate::KERNEL_OFFSET;
         {
             let mut flush_all = MapperFlushAll::new();
             let start_page = Page::containing_address(VirtualAddress::new(onscreen));
             let end_page = Page::containing_address(VirtualAddress::new(onscreen + size * 4));
             for page in Page::range_inclusive(start_page, end_page) {
-                let frame = Frame::containing_address(PhysicalAddress::new(page.start_address().get() - ::KERNEL_OFFSET));
+                let frame = Frame::containing_address(PhysicalAddress::new(page.start_address().get() - crate::KERNEL_OFFSET));
                 let flags = EntryFlags::PRESENT | EntryFlags::NO_EXECUTE | EntryFlags::WRITABLE | EntryFlags::HUGE_PAGE;
                 let result = active_table.map_to(page, frame, flags);
                 flush_all.consume(result);
diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs
index 30425f0e2033b2adce9fe9b7ceefa3cc6c94841f..7cdec3d5de362568954768bba55feec4b14bebb9 100644
--- a/src/arch/x86_64/start.rs
+++ b/src/arch/x86_64/start.rs
@@ -10,7 +10,7 @@ use crate::allocator;
 #[cfg(feature = "acpi")]
 use crate::acpi;
 #[cfg(feature = "graphical_debug")]
-use arch::x86_64::graphical_debug;
+use crate::arch::x86_64::graphical_debug;
 use crate::arch::x86_64::pti;
 use crate::device;
 use crate::gdt;
diff --git a/src/asm/x86_64/trampoline.asm b/src/asm/x86_64/trampoline.asm
new file mode 100644
index 0000000000000000000000000000000000000000..39d89bcc083b0a594709cd1b6dff388981d1274b
--- /dev/null
+++ b/src/asm/x86_64/trampoline.asm
@@ -0,0 +1,174 @@
+; trampoline for bringing up APs
+; compiled with nasm by build.rs, and included in src/acpi/madt.rs
+
+ORG 0x8000
+SECTION .text
+USE16
+
+trampoline: ; first byte of the image; ORG above places it at 0x8000
+    jmp short startup_ap ; skip over the parameter block that follows
+    times 8 - ($ - trampoline) nop ; pad with nops so the parameters start at offset 8
+    .ready: dq 0 ; set to 1 by long_mode_ap just before it jumps to .code
+    .cpu_id: dq 0 ; the address of this field is passed on in rdi
+    .page_table: dq 0 ; not read anywhere in this file
+    .stack_start: dq 0 ; not read anywhere in this file
+    .stack_end: dq 0 ; long_mode_ap sets rsp to this value minus 256
+    .code: dq 0 ; address long_mode_ap jumps to once in 64-bit mode
+
+startup_ap: ; 16-bit code: sets up control registers and the GDT, then far-jumps to long_mode_ap
+    cli ; disable maskable interrupts
+
+    xor ax, ax ; zero ds, es and ss
+    mov ds, ax
+    mov es, ax
+    mov ss, ax
+
+    ; initialize stack to invalid value
+    mov sp, 0
+
+    ;cr3 holds pointer to PML4
+    mov edi, 0x70000 ; fixed address; trampoline.page_table is not consulted here
+    mov cr3, edi
+
+    ; Enable FPU
+    mov eax, cr0
+    and al, 11110011b ; Clear task switched (3) and emulation (2)
+    or al, 00100010b ; Set numeric error (5) monitor co-processor (1)
+    mov cr0, eax
+
+    ; 18: Enable OSXSAVE
+    ; 10: Unmasked SSE exceptions
+    ; 9: FXSAVE/FXRSTOR
+    ; 7: Page Global
+    ; 5: Page Address Extension
+    ; 4: Page Size Extension
+    mov eax, cr4
+    or eax, 1 << 18 | 1 << 10 | 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
+    mov cr4, eax
+
+    ; initialize floating point registers
+    fninit
+
+    ; load protected mode GDT
+    lgdt [gdtr] ; gdtr and the gdt it points to are defined at the end of this file
+
+    mov ecx, 0xC0000080               ; Read from the EFER MSR.
+    rdmsr
+    or eax, 1 << 11 | 1 << 8          ; Set the Long-Mode-Enable and NXE bit.
+    wrmsr                             ; Write the updated value back to EFER.
+
+    ;enabling paging and protection simultaneously
+    mov ebx, cr0
+    ; 31: Paging
+    ; 16: write protect kernel
+    ; 0: Protected Mode
+    or ebx, 1 << 31 | 1 << 16 | 1
+    mov cr0, ebx
+
+    ; far jump to enable Long Mode and load CS with 64 bit segment
+    jmp gdt.kernel_code:long_mode_ap
+
+USE64 ; everything below is assembled as 64-bit code
+long_mode_ap: ; reached through the far jump at the end of startup_ap
+    mov rax, gdt.kernel_data ; selector (byte offset within gdt) of the data descriptor
+    mov ds, rax
+    mov es, rax
+    mov fs, rax
+    mov gs, rax
+    mov ss, rax
+
+    mov rcx, [trampoline.stack_end] ; value of the stack_end parameter
+    lea rsp, [rcx - 256] ; stack pointer starts 256 bytes below stack_end
+
+    mov rdi, trampoline.cpu_id ; rdi = address of the cpu_id field, not its value
+
+    mov rax, [trampoline.code] ; fetch the jump target before .ready is set
+    mov qword [trampoline.ready], 1 ; store 1 into the ready field
+    jmp rax ; continue at the address read from trampoline.code
+
+struc GDTEntry ; layout of one 8-byte segment descriptor
+    .limitl resw 1 ; limit, bits 0-15
+    .basel resw 1 ; base, bits 0-15
+    .basem resb 1 ; base, bits 16-23
+    .attribute resb 1 ; access byte, built from the attrib.* values
+    .flags__limith resb 1 ; flags.* values in the high nibble, limit bits 16-19 in the low nibble
+    .baseh resb 1 ; base, bits 24-31
+endstruc
+
+attrib: ; values OR-ed together to form the GDTEntry.attribute byte
+    .present              equ 1 << 7
+    .ring1                equ 1 << 5 ; privilege level occupies bits 5-6
+    .ring2                equ 1 << 6
+    .ring3                equ 1 << 5 | 1 << 6
+    .user                 equ 1 << 4 ; set for code/data descriptors, clear for system ones
+; type bits for user (code/data) descriptors
+    .code                 equ 1 << 3
+;   code segments only
+    .conforming           equ 1 << 2
+    .readable             equ 1 << 1
+;   data segments only
+    .expand_down          equ 1 << 2
+    .writable             equ 1 << 1
+    .accessed             equ 1 << 0
+; type values for system descriptors (.user clear)
+;   legacy
+    .tssAvailabe16        equ 0x1
+    .ldt                  equ 0x2
+    .tssBusy16            equ 0x3
+    .call16               equ 0x4
+    .task                 equ 0x5
+    .interrupt16          equ 0x6
+    .trap16               equ 0x7
+    .tssAvailabe32        equ 0x9
+    .tssBusy32            equ 0xB
+    .call32               equ 0xC
+    .interrupt32          equ 0xE
+    .trap32               equ 0xF
+;   long mode
+    .ldt32                equ 0x2
+    .tssAvailabe64        equ 0x9
+    .tssBusy64            equ 0xB
+    .call64               equ 0xC
+    .interrupt64          equ 0xE
+    .trap64               equ 0xF
+
+flags: ; values OR-ed together to form the GDTEntry.flags__limith byte
+    .granularity equ 1 << 7
+    .available equ 1 << 4
+; user (code/data) descriptors
+    .default_operand_size equ 1 << 6
+;   code segments: bit 5 selects long mode
+    .long_mode equ 1 << 5
+;   data segments: the same bit is reserved
+    .reserved equ 1 << 5
+
+gdtr: ; pseudo-descriptor loaded by lgdt in startup_ap
+    dw gdt.end - 1  ; limit: offset of the last valid byte of the table (size - 1)
+    dq gdt          ; offset
+
+gdt: ; descriptor table pointed to by gdtr
+.null equ $ - gdt ; selector 0
+    dq 0 ; null descriptor
+
+.kernel_code equ $ - gdt ; selector 8, used by the far jump in startup_ap
+istruc GDTEntry
+    at GDTEntry.limitl, dw 0
+    at GDTEntry.basel, dw 0
+    at GDTEntry.basem, db 0
+    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code
+    at GDTEntry.flags__limith, db flags.long_mode
+    at GDTEntry.baseh, db 0
+iend
+
+.kernel_data equ $ - gdt ; selector 16, loaded into ds/es/fs/gs/ss by long_mode_ap
+istruc GDTEntry
+    at GDTEntry.limitl, dw 0
+    at GDTEntry.basel, dw 0
+    at GDTEntry.basem, db 0
+; AMD System Programming Manual states that the writeable bit is ignored in long mode, but ss can not be set to this descriptor without it
+    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable
+    at GDTEntry.flags__limith, db 0
+    at GDTEntry.baseh, db 0
+iend
+
+.end equ $ - gdt ; size of the table in bytes