#!/usr/bin/env python

# Number of bytes requested from the input file per read (4 MiB).
chunk = 1048576 * 4

# http://www.obrador.com/essentialjpeg/headerinfo.htm
#
# All markers are bytes literals: on Python 2 these are ordinary `str`
# objects (unchanged behaviour), and on Python 3 they can be searched for in
# data read from a file opened in binary mode.

# SOI marker immediately followed by the APP0 marker that opens a JFIF file.
start_of_image = soi = b'\xff\xd8\xff\xe0'
# Identifier string stored at the start of the APP0 segment.
jfif_id = b'JFIF\x00'
# DQT: define quantization table(s).
diffie_quant_marker = b'\xff\xdb'
# DHT: define Huffman table(s).
diffie_huffman_marker = b'\xff\xc4'
# SOF0: start of frame (baseline DCT).
frame_marker = b'\xff\xc0'
# SOS: start of scan.
scan_marker = b'\xff\xda'
# NOTE(review): the JPEG specification assigns 0xFFFE to COM (comment);
# 0xFFEE is APP14.  The value is kept as-is -- confirm which was intended.
comment_marker = b'\xff\xee'
# EOI: end of image.
end_of_image = eoi = b'\xff\xd9'

def extra_check(string):
    """Second-stage test that `string` starts like a JFIF file.

    Returns True when `jfif_id` occurs entirely within the first 11
    characters of `string`, and False otherwise.
    """
    # 11 = the 4-byte `soi` prefix, 2 further bytes, then the 5-byte id.
    header = string[:11]
    return header.find(jfif_id) != -1

def slice_image(img):
    """Return the prefix of `img` that ends with the image's EOI marker.

    `img` is a buffer assumed to begin at the start of a jpeg file.  The
    quantization-table, Huffman-table, frame and scan markers are looked up
    in that order, each search starting where the previous marker was found,
    and the first EOI marker after the last one located ends the image.

    A marker that is not present is skipped instead of aborting the search.
    (Feeding the -1 returned by a failed `find` into the next `find` as its
    start offset restricts that search to the final byte of the buffer, so a
    single missing marker used to produce a one-byte result.)

    The comment marker is deliberately not used as an anchor: it is not
    present in every file, and an occurrence later in the buffer would push
    the search past this image's own EOI.

    Returns the slice up to and including EOI, or all of `img` unchanged
    when no EOI marker follows the located markers (e.g. the data is
    truncated).
    """
    pos = 0
    for marker in (diffie_quant_marker, diffie_huffman_marker,
                   frame_marker, scan_marker):
        found = img.find(marker, pos)
        if found != -1:
            pos = found
    eoi_loc = img.find(end_of_image, pos)
    if eoi_loc == -1:
        # No terminator available: hand back everything we were given.
        return img
    return img[:eoi_loc + len(end_of_image)]

def generate_jpeg_files(f, max_image_size=None):
    """A generator that spits out strings that match jpeg files.

    `f` is a file opened in binary mode.  It is read `chunk` bytes at a
    time and scanned for the `soi` signature; every hit that also passes
    `extra_check` is cut to length with `slice_image` and yielded.

    `max_image_size` bounds how many bytes are buffered while waiting for
    a candidate's EOI marker to show up; it defaults to eight chunks.  Once
    the bound is reached (or the file ends) whatever `slice_image` returns
    for the data at hand is yielded as-is.

    Compared with scanning each chunk in isolation this
      * keeps the last few bytes of a chunk when no signature was found, so
        a signature split across two reads is still detected, and
      * keeps reading while the sliced candidate does not yet end with the
        EOI marker, so an image that continues into the next chunk is
        returned whole.
    """
    if max_image_size is None:
        max_image_size = chunk * 8
    # A signature may begin in the last len(soi) - 1 bytes of a read.
    overlap = len(soi) - 1
    buf = b''
    while True:
        start = buf.find(soi)
        if start == -1:
            data = f.read(chunk)
            if not data:
                # End of file and nothing left to examine.
                return
            buf = buf[max(len(buf) - overlap, 0):] + data
            continue
        buf = buf[start:]
        if len(buf) < 11:
            # Not enough bytes after the signature for extra_check.
            buf += f.read(chunk)
        if not extra_check(buf):
            # hmm.. it wasn't a jpeg after all, continue
            buf = buf[1:]
            continue
        image = slice_image(buf)
        while (not image.endswith(end_of_image)
               and len(buf) < max_image_size):
            data = f.read(chunk)
            if not data:
                break
            buf += data
            image = slice_image(buf)
        # Always consume at least one byte so the scan cannot stall.
        buf = buf[max(len(image), 1):]
        yield image


def find_all_images(filename, threshold=None):
    """Write every jpeg candidate found in `filename` to its own file.

    Candidates come from `generate_jpeg_files` and are written to the
    current directory as potential_image_0000.jpg, potential_image_0001.jpg
    and so on.

    `threshold` is the maximum number of files to write; a false value
    (None or 0) means no limit.  Both the input and each output file are
    closed even if reading or writing raises.
    """
    written = 0
    with open(filename, 'rb') as f:
        for img in generate_jpeg_files(f):
            with open('potential_image_%04d.jpg' % written, 'wb') as ifile:
                ifile.write(img)
            written += 1
            # Compare the count of files written (not the 0-based index)
            # so that exactly `threshold` files are produced.
            if threshold and written >= threshold:
                break

if __name__ == '__main__':
    # Command-line entry point: extract jpeg candidates from one file.
    import optparse
    parser = optparse.OptionParser(usage='%prog [opts] filename', version='1.0')
    # type='int' lets optparse reject a non-numeric value with a usage
    # message; the option stays None when it is not given.
    parser.add_option('-t', '--threshold', type='int',
                      help='maximum number of image files to extract')
    opts, args = parser.parse_args()
    if not args:
        # Report a usage error rather than failing with IndexError below.
        parser.error('no filename given')
    find_all_images(args[0], opts.threshold)
    
