00001 /* 00002 00003 This file is part of the FAST-ER machine learning system. 00004 Copyright (C) 2008 Edward Rosten and Los Alamos National Laboratory 00005 00006 This program is free software; you can redistribute it and/or modify 00007 it under the terms of the GNU General Public License as published by 00008 the Free Software Foundation; either version 2 of the License, or 00009 (at your option) any later version. 00010 00011 This program is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 GNU General Public License for more details. 00015 00016 You should have received a copy of the GNU General Public License along 00017 with this program; if not, write to the Free Software Foundation, Inc., 00018 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00019 */ 00020 #include "faster_bytecode.h" 00021 #include <cvd/image.h> 00022 #include <cerrno> 00023 00024 ///\cond never 00025 using namespace CVD; 00026 using namespace std; 00027 ///\endcond 00028 00029 #ifdef JIT 00030 #include <sys/mman.h> 00031 ///This struct contains a x86 machine-code compiled version of the detector. The detector 00032 ///operates on a single row and inserts offset from the beginning of the image in to a 00033 ///std::vector. 00034 ///@ingroup gFastTree 00035 class jit_detector 00036 { 00037 public: 00038 00039 00040 ///Run the compiled detector on a row of an image. 00041 ///@param im The image. 00042 ///@param row The row to detect corners in. 00043 ///@param xmin The starting position. 00044 ///@param xmax The ending position. 00045 ///@param corners The detected corners as offsets from image.data(). 00046 ///@param threshold The corner detector threshold. 00047 void detect_in_row(const Image<byte>& im, int row, int xmin, int xmax, vector<int>& corners, int threshold) 00048 { 00049 00050 const byte* p = im[row] + xmin; 00051 const int n = xmax - xmin; 00052 void* cs = &corners; 00053 const void* im_data = im.data(); 00054 /* r/m usage, at entry to machine code 00055 In use: 00056 %ecx Num remaining 00057 %edi threshold 00058 %ebp Detect in row machine code procedure address 00059 %ebx cb 00060 %edx c_b 00061 %esi data 00062 %eax Scratch 00063 00064 4%esp %esi: produced automatically by call 00065 8%esp image.data() 00066 12%esp &vector<int> 00067 16%esp vector_inserter: simple function for calling member of std::vector 00068 00069 00070 Input: 00071 0 num remaining 00072 1 data pointer 00073 2 threshold 00074 3 proc 00075 4 push_back_proc 00076 5 vector<int> 00077 6 image.data() 00078 */ 00079 00080 __asm__ __volatile__( 00081 //Save all registers 00082 " pusha \n" 00083 00084 //Load operands in to correct places 00085 " pushl %4 \n" 00086 " pushl %5 \n" 00087 " pushl %6 \n" 00088 " movl %0, %%ecx \n" 00089 " movl %1, %%esi \n" 00090 " movl %2, %%edi \n" 00091 " movl %3, %%ebp \n" //%? uses ebp, so trash ebp last 00092 00093 00094 //Start the loop 00095 " cmp $0, %%ecx \n" 00096 " je 1 \n" 00097 " call *%%ebp \n" 00098 "1: \n" 00099 00100 00101 //Unload operands 00102 " popl %%eax \n" 00103 " popl %%eax \n" 00104 " popl %%eax \n" 00105 00106 //Restore all registers 00107 " popa \n" 00108 : 00109 : "m"(n), "m"(p), "m"(threshold), "m"(proc), "i"(&vector_inserter), "m"(cs), "m"(im_data) 00110 ); 00111 00112 00113 } 00114 00115 00116 ///Create a compiled detector from the bytecode. 00117 ///@param v Bytecode. 00118 jit_detector(const vector<block_bytecode::fast_detector_bit>& v) 00119 { 00120 //blocksize 00121 const int bs=28; 00122 00123 length = bs * (v.size() + 2); //Add head and tail block 00124 00125 /* The original assembler code looked like this 00126 This is now done in machine code, with the whole tree in 00127 place of line 0x804e0c1. 00128 00129 804e0b3: 83 f9 00 cmp $0x0,%ecx 00130 804e0b6: 74 1b je 804e0d3 <finished> 00131 00132 0804e0b8 <loop>: 00133 804e0b8: 0f b6 16 movzbl (%esi),%edx 00134 804e0bb: 89 d3 mov %edx,%ebx 00135 804e0bd: 29 fa sub %edi,%edx 00136 804e0bf: 01 fb add %edi,%ebx 00137 804e0c1: ff d5 call *%ebp 00138 804e0c3: a8 ff test $0xff,%al 00139 804e0c5: 74 08 je 804e0cf <nocorner> 00140 804e0c7: 56 push %esi 00141 804e0c8: 51 push %ecx 00142 804e0c9: ff 54 24 10 call *0x10(%esp) 00143 804e0cd: 59 pop %ecx 00144 804e0ce: 58 pop %eax 00145 00146 0804e0cf <nocorner>: 00147 804e0cf: 46 inc %esi 00148 804e0d0: 49 dec %ecx 00149 804e0d1: 75 e5 jne 804e0b8 <loop> //jne == jnz 00150 00151 Unused spaces are filled in with int $3, (instruction 0xcc), which 00152 causes a debug trap. Makes catching errors easier. 00153 00154 The consists of fixed sized blocks pasted together. The size is determined by the 00155 largest block, which is a tree node. This makes jump computation trivial, but 00156 it also means that short jumps are never used, and the code is therefore larger 00157 than necessary. 00158 00159 The rest have 0xcc filled in in the spare places. 00160 00161 The blocks are templates and have the relevant parts filled in prior to 00162 copying. 00163 00164 Each tree node (including leaves are represented by an entire block) 00165 00166 Detectod corners are inserted in to a vector<int> as the integer offset of the corner 00167 pixel from the beginning of the image 00168 */ 00169 00170 const unsigned char loop_head[bs] = 00171 { 00172 0xEB, 0x11, //jmp + 17 00173 00174 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, //dead space 00175 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, 00176 0xcc,0xcc,0xcc,0xcc,0xcc, 00177 00178 00179 0x0f, 0xb6, 0x16, //movzbl (%esi),%edx Load data 00180 0x89, 0xd3, //mov %edx,%ebx 00181 0x29, 0xfa, //sub %edi,%edx Compute c_b 00182 0x01, 0xfb, //add %edi,%ebx Compute cb 00183 }; 00184 const int loop_head_start=19; //Jump to here to continue loop 00185 00186 00187 unsigned char loop_tail[bs] = 00188 { 00189 0x56, //push %esi Functions seem to trash this otherwise 00190 0x51, //push %ecx Functions seem to trash this otherwise 00191 0xFF, 0x54, 0x24, 0x14, //call *16(%esp) Other arguments on the stack already 00192 0x59, //pop %ecx Clean stack 00193 0x58, //pop %eax ... 00194 00195 0x46, //inc %esi 00196 0x49, //dec %ecx 00197 0x0F, 0x85, 0xcc, 0xcc, 0xcc, 0xcc, //jnz <back to first block> 00198 00199 0xc3, //ret 00200 0xcc,0xcc,0xcc,0xcc, //dead space 00201 0xcc,0xcc,0xcc,0xcc, 00202 0xcc,0xcc,0xcc, 00203 }; 00204 const int loop_tail_address_offset = 12; //fill in the jump <back to first block> address here 00205 const int loop_tail_jump_delta = 16; //Jump block_size*depth + this, to loop. 00206 const int loop_tail_entry = 8; //jump to here to avoid inserting current point as corner 00207 00208 unsigned char cont_or_goto[bs] = 00209 { 00210 0xE9,0xcc, 0xcc, 0xcc, 0xcc, //Jump to end of loop 00211 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, //dead space 00212 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, 00213 0xcc,0xcc,0xcc,0xcc,0xcc,0xcc, 00214 0xcc,0xcc,0xcc,0xcc,0xcc 00215 }; 00216 const int cont_jmp_addr = 1; //Jump address filled in here 00217 const int cont_delta = 5; //This much in addition to block delta 00218 00219 unsigned char branch[bs] = 00220 { 00221 0x0f, 0xB6, 0x86, 0xcc, 0xcc, 0xcc, 0xcc, //movzbl OOOO(%esi),%eax 00222 0x39, 0xd8, //cmp %ebx, %eax (eax - ebx) = (data[##]-cb 00223 0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc, //jg XXXX jmp by XXXX if eax > ebx 00224 0x39, 0xC2, //cmp %eax, %edx (edx - eax) = c_b - data[##] 00225 0x0F, 0x8F, 0xcc, 0xcc, 0xcc, 0xcc, //jg YYYY jmp by YYYY if ecx > ebx 00226 0xE9, 0xcc, 0xcc, 0xcc, 0xcc, //jmp ZZZZ Unconditional jump to ZZZZ 00227 }; 00228 const int block_off_off = 3; 00229 const int block_gt_off = 11; 00230 const int block_lt_off = 19; 00231 const int block_eq_off = 24; 00232 00233 00234 //mmap a writable, executable block of memory for JITted code 00235 proc = (unsigned char*) mmap(0, length, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); 00236 if(proc == MAP_FAILED) 00237 { 00238 cerr << "mmap failed with error: " << strerror(errno) << endl; 00239 exit(1); 00240 } 00241 00242 //Copy in the loop head: no parts to be filled in. 00243 memcpy(proc, loop_head, bs); 00244 00245 for(int i=0; i < (int)v.size(); i++) 00246 { 00247 if(v[i].lt == 0) // leaf 00248 { 00249 if(v[i].gt == 0) //Fill in jump to continue part 00250 { 00251 *(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size()- i) - cont_delta + loop_tail_entry; 00252 } 00253 else //fill in jump to insert part 00254 { 00255 *(int*)(cont_or_goto + cont_jmp_addr) = bs * (v.size() - i) - cont_delta; 00256 } 00257 00258 00259 memcpy(proc + (i+1)*bs, cont_or_goto, bs); 00260 } 00261 else 00262 { 00263 *(int*)(branch+block_off_off) = v[i].offset; 00264 00265 //Optimization: leaf nodes have a non-conditional goto in them 00266 //so goto the right place directly, rather than the leaf node. 00267 //This has a 5% effect or so, so bigger gains elsewhere. 00268 //Removed for simplicity. 00269 00270 *(int*)(branch+block_gt_off) = (v[i].gt -i) * bs - (block_gt_off + 4); 00271 *(int*)(branch+block_lt_off) = (v[i].lt -i) * bs - (block_lt_off + 4); 00272 *(int*)(branch+block_eq_off) = (v[i].eq -i) * bs - (block_eq_off + 4); 00273 00274 memcpy(proc + (i+1)*bs, branch, bs); 00275 } 00276 } 00277 00278 //Insert the correct backwards jump for looping 00279 *(int*)(loop_tail+loop_tail_address_offset) = -bs * (1+v.size()) - loop_tail_jump_delta + loop_head_start; 00280 memcpy(proc + bs * (v.size() + 1), loop_tail, bs); 00281 00282 } 00283 00284 ///Destroy object, unmapping executable memory. 00285 ~jit_detector() 00286 { 00287 munmap(proc, length); 00288 } 00289 00290 private: 00291 ///Prevent copying 00292 void operator=(const jit_detector&); 00293 ///Prevent copying 00294 jit_detector(const jit_detector&); 00295 00296 unsigned char* proc; ///< The machine code is stored in this mmap() allocated data which allows code execution. 00297 int length; ///< Number of mmap() allocated bytes. 00298 00299 ///Callback function to allow insertion in to std::vector. The execution of this function 00300 ///relies on the stack having the following layout (stack head on the left): 00301 ///@code 00302 ///return_address first_arg second_arg etc... 00303 ///@endcode 00304 ///so that the arguemnts directly reflect the stack layout. 00305 ///For speed, and in order to minimize stack handling, the argument list spans two call instructions worth of stack. 00306 /// 00307 ///@param ecx_dummy Pushed by the machine code, since the ABI allows ecx to be trashed 00308 ///@param p The pointer to the current pixel. Pushed by the machine code. 00309 ///@param esp_return_dummy Location to return to on a return from the machine code. Generated by the assembler call in to the machine code. 00310 ///@param im_data Pointer to the first image pixel. Pushed by the assembler caller. 00311 ///@param i Pointer to the std::vector<int> which stores the data. Pushed by the assembler caller. 00312 static void vector_inserter(int ecx_dummy, const byte* p, const void* esp_return_dummy, const byte* im_data, vector<int>* i) 00313 { 00314 i->push_back(p-im_data); 00315 } 00316 }; 00317 #endif 00318 00319 ///Detect corners in an image. The width of the image must match the width the 00320 ///detector was compiled to (using tree_elemeent::make_fast_detector for the 00321 ///results to make sense. The bytecode is JIT coimpiled if possible. 00322 ///@param im The image in which to detect corners 00323 ///@param corners Detected corners are inserted in to this container. 00324 ///@param threshold Corner detector threshold to use 00325 ///@param xmin x coordinate to start at. 00326 ///@param ymin y coordinate to start at. 00327 ///@param xmax x coordinate to go up to. 00328 ///@param ymax y coordinate to go up to. 00329 void block_bytecode::detect(const CVD::Image<CVD::byte>& im, std::vector<int>& corners, int threshold, int xmin, int xmax, int ymin, int ymax) 00330 { 00331 #ifdef JIT 00332 jit_detector jit(d); 00333 for(int y = ymin; y < ymax; y++) 00334 jit.detect_in_row(im, y, xmin, xmax, corners, threshold); 00335 #else 00336 cerr << "Hello!\n"; 00337 for(int y = ymin; y < ymax; y++) 00338 for(int x=xmin; x < xmax; x++) 00339 if(detect_no_score(&im[y][x], threshold)) 00340 corners.push_back(&im[y][x] - im.data()); 00341 #endif 00342 } 00343