#---------------------------------------------------------------------------
# OSM_Extract.py
#
# Extracts a BBOX footprint from a planet file. Like Frederik Ramm's
# history-extract.pl, except that it works directly against the compressed
# .bz2 file and does not depend on end-of-line characters.
#
# That is, you can use it against the full-planet.bz2 file.
#
# Writes directly to a .bz2 file so disk space usage is minimized.
#
# Unlike other extract tools, it also keeps changesets.
#
# Uses multiple passes to get everything:
#
# Pass 1:
#   Scan through all nodes,
#     - Make a list of nodes in BBOX
#     - Note changesets for each node
#   Scan through all ways
#     - Make a list of ways having at least one node in list
#     - Note changesets for each way
#   Scan through all relations
#     - Make a list of relations having at least one node or way in lists
#     - Note changesets for each relation
#
# Pass 2:
#   Copy listed changesets to new file
#   Copy listed nodes to new file
#   Copy listed ways to new file
#   Copy listed relations to new file
#
# Of course, all tags for each object are also copied.
#
# Since this is designed to work with historical data, it tends to grab more than
# it needs. Specifically, old, deleted nodes will cause ways and relations to be
# included (etc.). It does not resolve missing objects though (i.e., nodes outside
# the BBOX that are in an included way).
#
#---------------------------------------------------------------------------
#   Name:       OSM Extract
#   Version:    1.0
#   Authored    By: Eric Wolf
#   Copyright:  Public Domain.
#---------------------------------------------------------------------------

# Command line parameters
# D:\GNIS_OSM\rhode_island.osm.bz2 D:\GNIS_OSM\ri_extract.osm.bz2 -72 -71 42 41
# D:\GNIS_OSM\djibouti.osm.bz2 D:\GNIS_OSM\dj_extract.osm.bz2 42.7 42.8 11.6 11.5
# D:\GNIS_OSM\full-planet-110115-1800.osm.bz2 D:\GNIS_OSM\full_extract.osm.bz2 -72 -71 42 41


# Import modules
import sys, os
import fileinput, bz2
import math, time

# BigXMLfile
#
# Reads bz2 compressed OSM XML file
#
# Does not decompress the whole file up front; it is read as a stream.
# Does not use "readline" or the equivalent.
# So it works with full-planet.osm.bz2
#
# Probably need to make this a separate module
#
class BigXMLfile:
    """Streaming reader that hands out one XML tag at a time from a .bz2 file.

    The file is read through ``bz2.BZ2File`` in ``buffsize``-byte pieces and
    split on the ``>`` character, so the reader never relies on line breaks
    and never holds more than the current tag plus one chunk in memory.
    """

    def __init__(self, filename):
        """Open *filename* for reading and load the first chunk into the buffer."""
        self.name = filename
        # The third positional argument is the Python 2 "buffering" size.
        self.fp = bz2.BZ2File(filename,'rb',16384*64)
        self.buffsize = 16384 # How many bytes to request from the file per read
        self.bufpos = 0       # Offset in self.buffer of the first unconsumed byte
        self.buffer = self.fp.read(self.buffsize)
        self.bread = len(self.buffer)  # Running total of bytes returned by read()

    def getXMLtag(self):
        """Return the next tag (everything up to and including the next '>').

        Leading/trailing whitespace is stripped and the bytes are decoded as
        UTF-8, with undecodable bytes replaced.  Returns '' once the file is
        exhausted; trailing data that contains no '>' is discarded.
        """
        close_bracket = b'>'

        # find the close bracket
        cb = self.buffer.find(close_bracket, self.bufpos)

        if cb < 0:
            # Drop what has already been handed out, keep the partial tag.
            self.buffer = self.buffer[self.bufpos:]
            self.bufpos = 0

            # Keep appending chunks until the tag is complete.  A tag may be
            # longer than buffsize, so a single refill is not always enough.
            while cb < 0:
                newb = self.fp.read(self.buffsize)

                # Hit the end of the file, need to return nada
                if len(newb) == 0:
                    return ''

                self.bread = self.bread + len(newb)

                # Only the newly appended bytes can contain the bracket.
                search_from = len(self.buffer)
                self.buffer = self.buffer + newb
                cb = self.buffer.find(close_bracket, search_from)

        # Pick out the tag and clean it up
        tag = self.buffer[self.bufpos:cb+1].strip()

        # shift our buffer pointer up; if this reaches the end of the buffer
        # the next call simply takes the refill path above
        self.bufpos = cb + 1

        # Source should be in utf-8, but sometimes its not.
        return tag.decode("utf-8", "replace")

    def getBread(self):
        """Return the total number of bytes read from the file so far."""
        return self.bread
    
# class BigXMLfile    

#---------------------------------------------------------------------------
#Gets the XML element name from the string passed in
#an end of element tag is /element
#---------------------------------------------------------------------------
def getElement(line):
    """Return the element name found in the XML tag *line*.

    The name is the text between the first '<' and the first space after it.
    When no space follows, the slice stops one character short of the end of
    the string, which drops the trailing '>' (so '</node>' gives '/node').
    """
    open_at = line.find('<')
    name_end = line.find(' ', open_at)
    return line[open_at + 1:name_end]
#---------------------------------------------------------------------------


#---------------------------------------------------------------------------
#Gets the value of the named attribute from the string
#---------------------------------------------------------------------------
def getAttributeValue(name,line):
    """Return the value of attribute *name* from the XML tag *line*.

    The attribute is located by searching for ' name="' (leading space,
    double quotes), so 'id' does not match inside 'uid'.  Returns '' when
    the attribute is absent or its closing quote is missing, instead of an
    unrelated slice of the tag.
    """
    marker = ' ' + name + '="'
    found = line.find(marker)
    if found < 0:
        # Attribute not present in this tag
        return ''

    sa = found + len(marker)
    ea = line.find('"', sa)
    if ea < 0:
        # Unterminated value (truncated tag)
        return ''

    return line[sa:ea]
#---------------------------------------------------------------------------    

#---------------------------------------------------------------------------
#Extract Node attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnNode(line):
    """Extract the attributes of a <node> tag.

    Returns the tuple (id, lon, lat, version, timestamp, uid, user,
    changeset), all as strings.  The timestamp is converted from
    '2011-01-25T19:13:46Z' to '01/25/2011 07:13:46 PM'.

    Raises ValueError if the hour digits of the timestamp cannot be parsed.
    """
    #<node id="38708798" version="1" timestamp="2007-09-02T03:45:45Z" uid="12818" user="Andreas Kloeckner" changeset="293802" lat="41.8124909" lon="-71.3598665"/>
    nid=getAttributeValue('id',line)
    nver=getAttributeValue('version',line)
    t=getAttributeValue('timestamp',line)
    s='/'
    #                               01234567890123456789
    # Timestamp comes in like this: 2011-01-25T19:13:46Z
    # Needs to go out like this: 01/25/2011 07:13:46 PM
    hour24 = int(t[11:13])

    # Noon and later is PM (12:xx is PM, not AM)
    if hour24 >= 12:
        m = ' PM'
    else:
        m = ' AM'

    # 12-hour clock: 00 -> 12 (AM), 13..23 -> 1..11, 12 stays 12
    h = hour24 % 12
    if h == 0:
        h = 12

    nts = t[5:7]+s+t[8:10]+s+t[0:4]+' '+"%02d"%h+t[13:19]+m

    nuid=getAttributeValue('uid',line)
    nuser=getAttributeValue('user',line)
    ncs=getAttributeValue('changeset',line)
    nx=getAttributeValue('lon',line)
    ny=getAttributeValue('lat',line)

    return(nid,nx,ny,nver,nts,nuid,nuser,ncs)
#---------------------------------------------------------------------------

#---------------------------------------------------------------------------
#Extract Relation attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnRelation(line):
    """Extract the attributes of a <relation> tag.

    Returns the tuple (id, version, timestamp, uid, user, changeset), all as
    strings.  The timestamp is converted from '2011-01-25T19:13:46Z' to
    '01/25/2011 07:13:46 PM'.

    Raises ValueError if the hour digits of the timestamp cannot be parsed.
    """
    #<relation id="60848" version="13" timestamp="2010-12-30T09:32:17Z" uid="207745" user="NE2" changeset="6805641">
    rid=getAttributeValue('id',line)
    rver=getAttributeValue('version',line)
    t=getAttributeValue('timestamp',line)
    s='/'
    #                               01234567890123456789
    # Timestamp comes in like this: 2011-01-25T19:13:46Z
    # Needs to go out like this: 01/25/2011 07:13:46 PM
    hour24 = int(t[11:13])

    # Noon and later is PM (12:xx is PM, not AM)
    if hour24 >= 12:
        m = ' PM'
    else:
        m = ' AM'

    # 12-hour clock: 00 -> 12 (AM), 13..23 -> 1..11, 12 stays 12
    h = hour24 % 12
    if h == 0:
        h = 12

    rts = t[5:7]+s+t[8:10]+s+t[0:4]+' '+"%02d"%h+t[13:19]+m

    ruid=getAttributeValue('uid',line)
    ruser=getAttributeValue('user',line)
    rcs=getAttributeValue('changeset',line)

    #       0    1   2    3    4    5
    return(rid,rver,rts,ruid,ruser,rcs)
#---------------------------------------------------------------------------

#---------------------------------------------------------------------------
#Extract Way attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnWay(line):
    """Extract the attributes of a <way> tag.

    Returns the tuple (id, version, timestamp, uid, user, changeset), all as
    strings.  The timestamp is converted from '2011-01-25T19:13:46Z' to
    '01/25/2011 07:13:46 PM'.

    Raises ValueError if the hour digits of the timestamp cannot be parsed.
    """
    #<way id="96685884" version="1" timestamp="2011-01-25T09:14:13Z" uid="2318" user="Latze" changeset="7081537">
    wid=getAttributeValue('id',line)
    wver=getAttributeValue('version',line)
    t=getAttributeValue('timestamp',line)
    s='/'
    #                               01234567890123456789
    # Timestamp comes in like this: 2011-01-25T19:13:46Z
    # Needs to go out like this: 01/25/2011 07:13:46 PM
    hour24 = int(t[11:13])

    # Noon and later is PM (12:xx is PM, not AM)
    if hour24 >= 12:
        m = ' PM'
    else:
        m = ' AM'

    # 12-hour clock: 00 -> 12 (AM), 13..23 -> 1..11, 12 stays 12
    h = hour24 % 12
    if h == 0:
        h = 12

    wts = t[5:7]+s+t[8:10]+s+t[0:4]+' '+"%02d"%h+t[13:19]+m

    wuid=getAttributeValue('uid',line)
    wuser=getAttributeValue('user',line)
    wcs=getAttributeValue('changeset',line)

    #       0    1   2    3    4    5
    return(wid,wver,wts,wuid,wuser,wcs)
#---------------------------------------------------------------------------

    
#---------------------------------------------------------------------------
#get the id attribute from a line of xml text
#used for ways and its segs, as id is only attribute needed
#---------------------------------------------------------------------------
def returnID(line):
    """Return the value of the 'id' attribute of the XML tag *line*."""
    element_id = getAttributeValue('id', line)
    return element_id
#---------------------------------------------------------------------------

#---------------------------------------------------------------------------
def returnTags(line):
    """Return the (key, value) pair of a <tag> element.

    Both parts are encoded to Latin-1 (unencodable characters replaced).
    The key is left-stripped, cut to 29 characters and has ':' replaced by
    '_'; the value is cut to 254 characters.
    """
    raw_key = getAttributeValue('k',line)
    encoded_key = raw_key.lstrip().encode("Latin-1","replace")[:29]
    # Standard fields can't have certain characters in the key name (like gnis:id)
    key = encoded_key.replace(':','_')

    raw_value = getAttributeValue('v',line)
    value = raw_value.encode("Latin-1","replace")[:254]

    return(key,value)
#---------------------------------------------------------------------------



# Resolve: Make this a little more flexible
try:
    inFile = str(sys.argv[1])
    outfile = str(sys.argv[2])
    bbox_left = float(sys.argv[3])
    bbox_right = float(sys.argv[4])
    bbox_top = float(sys.argv[5])
    bbox_bottom = float(sys.argv[6])

except:
    print "Input parameters are incorrect\n\n"
    print "Usage:\n"
    print "  osm_extract.py input_file.osm.bz2 output_file.osm.bz2 left right top bottom\n"
    sys.exit(-1)


# Timer start; elapsed time is reported when the script finishes
start = time.clock()

# Do we want relations?
include_relations = True

# Do we want to resolve ways
# (when True, every node of a kept way is added to node_list)
resolve_ways = True

# Ids of the objects (and the changesets that touched them) selected so far
node_list = set()
way_list = set()
relation_list = set()
changeset_list = set()

#
# Processing flags
#

# Enable/disable the use of bz2 compression for temp files
useBZ2_temp_files = False

# Enable/disable deleting temp files (for debugging)
delete_temp_files = False

# Counters over everything scanned (total_*) and over what is kept (keep_*)
total_node_count = 0
total_tag_count = 0
total_way_count = 0
total_way_node_count = 0
total_rel_count = 0
total_rel_mem_count = 0
keep_node_count = 0
keep_way_count = 0
keep_way_node_count = 0
keep_rel_count = 0
keep_rel_mem_count = 0
keep_tag_count = 0

# Counters for the element currently being read
this_tag_count = 0
this_rel_mem_count = 0

# Smallest/largest ids seen.  The minimum starts at infinity so that any
# real id replaces it; a fixed number such as 100000000 is smaller than
# many real node ids and would be reported as the minimum by mistake.
min_node_id = float('inf')
max_node_id = 0
min_way_node_id = float('inf')
max_way_node_id = 0

# "Keep the element currently being read" flags
keep_relation = False
keep_node = False
keep_way = False


#
# Step 1: Scan input file, build lists
#
try:
    # Input is always a BZ2 compressed file
    inputfile = BigXMLfile(inFile)

    print "Step 1: List nodes in BBOX"

    line_count=0

    xml_tag_bytes = 0
    total_bytes_to_write = 0
   
    #for uline in inputfile:
    while True:
    
        # Read one XML tag without depending on line breaks 
        # (so this works with history files)
        line = inputfile.getXMLtag()
    
        if line == '':
            break
    
        line_count += 1
        xml_tag_bytes += len(line)

        if (line_count % 250000) == 0:
            print "Processed " + str(line_count) + " lines."
        
        if line[1] == '/':
            element = line[1:line.find('>',1)]
        else:
            element = line[1:line.find(' ',1)]

        # Do differents based on the element type (node/way/relation)
        if element == 'node':
            #node = returnNode(line)

            s = line.find('id="',5) + 4
            e = line.find('"',s)
            node_id = int(line[s:e])
            
            if node_id > max_node_id:
                max_node_id = node_id

            if node_id < min_node_id:
                min_node_id = node_id
            
            s = line.find('lat="',5) + 5
            e = line.find('"',s)
            lat = float(line[s:e])
            
            # Is the node in the BBOX?
            if lat < bbox_bottom or lat > bbox_top:
                continue

            s = line.find('lon="',5) + 5
            e = line.find('"',s)
            long = float(line[s:e])
            
            # Is the node in the BBOX?
            if long < bbox_left or long > bbox_right:
                continue

            keep_node = True
            node_list.add(node_id)
            
            s = line.find('changeset="',5) + 11
            e = line.find('"',s)
            cs = int(line[s:e])
            changeset_list.add(cs)

        # End of node - ignored for now
        elif element == '/node':
            total_node_count += 1

            if keep_node:
                total_bytes_to_write += xml_tag_bytes
                xml_tag_bytes = 0
                keep_node = False
                keep_node_count += 1
                keep_tag_count += this_tag_count
                total_bytes_to_write += xml_tag_bytes
            
            xml_tag_bytes = 0
            this_tag_count = 0
            
        # Start of way
        elif element == 'way':
            s = line.find('id="',4) + 4
            e = line.find('"',s)
            way_id = int(line[s:e])
            
            s = line.find('changeset="',5) + 11
            e = line.find('"',s)
            way_cs = int(line[s:e])

            way_nodes = set()
            keep_way = False
                        
        # Way nodes
        elif element == 'nd':
            total_way_node_count += 1

            s = line.find('ref="',1) + 5
            e = line.find('"',s)
            node_id = int(line[s:e])
            way_nodes.add(node_id)
            
            if node_id in node_list:
                keep_way = True
                
                if node_id > max_way_node_id:
                    max_way_node_id = node_id

                if node_id < min_way_node_id:
                    min_way_node_id = node_id
                
        # End of way - check if we keep it
        elif element == '/way':
            total_way_count += 1
            
            if keep_way:
                way_list.add(way_id)

                changeset_list.add(way_cs)

                # This adds nodes not in BBOX but part of way that intersects it
                # This really slows things down!
                if resolve_ways:
                    node_list.update(way_nodes)
                    keep_way_node_count += len(way_nodes)
                    
                keep_tag_count += this_tag_count
                keep_way_count += 1

                total_bytes_to_write += xml_tag_bytes

            total_tag_count += this_tag_count                        
            this_tag_count = 0
            
            keep_way = False
            xml_tag_bytes = 0
                
        # Tags are ignored right now
        elif element == 'tag':
            this_tag_count += 1

        elif element == 'relation' and include_relations:
            s = line.find('id="',4) + 4
            e = line.find('"',s)
            rel_id = int(line[s:e])
            
            s = line.find('changeset="',5) + 11
            e = line.find('"',s)
            rel_cs = int(line[s:e])

            relation_ways = set()
            relation_nodes = set()

            keep_relation = False

        elif element == 'member':
            this_rel_mem_count += 1

            if include_relations:
                s = line.find('type="',1) + 6
                e = line.find('"',s)
                memtype = line[s:e]
                
                s = line.find('ref="',1) + 5
                e = line.find('"',s)
                member = int(line[s:e])

                if memtype == 'way':
                    relation_ways.add(member)
                    if member in way_list:
                        keep_relation = True
                else:
                    relation_nodes.add(member)
                    if member in node_list:
                        keep_relation = True

        elif element == '/relation':
            if keep_relation:
                relation_list.add(rel_id)
                changeset_list.add(rel_cs)
                node_list.update(relation_nodes)
                way_list.update(relation_ways)
                keep_rel_count += 1
                keep_rel_mem_count += this_rel_mem_count
                total_bytes_to_write += xml_tag_bytes
                keep_tag_count += this_tag_count

            total_rel_count += 1

            total_rel_mem_count += this_rel_mem_count
            this_rel_mem_count = 0

            total_tag_count += this_tag_count
            this_tag_count = 0
            xml_tag_bytes = 0

            keep_relation = False

        # if element==...
        
    # while True:


    print 'Bytes read from OSM file: ' + str(inputfile.getBread())
    print 'Bytes to be written: ' + str(total_bytes_to_write)
    print "Lines/Elements Scanned: " + str(line_count) + " lines"
    print "Total Nodes: " + str(total_node_count)
    print "Kept Nodes: " + str(keep_node_count)
    print "Total Ways: " + str(total_way_count)
    print "Kept Ways: " + str(keep_way_count)
    print "Total Way Nodes: " + str(total_way_node_count)
    print "Kept Way Nodes: " + str(keep_way_node_count)
    print "Total Relations: " + str(total_rel_count)
    print "Kept Relations: " + str(keep_rel_count)
    print "Total Relation Members:" + str(total_rel_mem_count)
    print "Kept Relation Members:" + str(total_rel_mem_count)
    print "Total Tags: " + str(total_tag_count)
    print "Kept Tags: " + str(keep_tag_count)

    print "Changeset list count: " + str(len(changeset_list))
    print "Node list count: " + str(len(node_list))
    print "Way list count: " + str(len(way_list))
    print "Relation list count: " + str(len(relation_list))
    
    print "Node Ids, min("+str(min_node_id)+"),max("+str(max_node_id)+")"
    print "Way Node Ids, min("+str(min_way_node_id)+"),max("+str(max_way_node_id)+")"

except Exception, ErrorDesc:
    print "Step 1 Failed : " + str(ErrorDesc)
    finish = time.clock()
    print "Extract incomplete in " + str(finish - start) + " seconds."
    sys.exit(-2)

finish = time.clock()
print "Extract complete in " + str(finish - start) + " seconds."