#---------------------------------------------------------------------------
# OSM_Extract.py
#
# Extracts a BBOX footprint from a planet file. Like Frederik Ramm's
# history-extract.pl except that it works directly against the compressed
# .bz2 file and doesn't care about end-of-lines.
#
# That is, you can use it against the full-planet.bz2 file.
#
# Writes directly to a .bz2 file so disk space usage is minimized.
#
# Unlike other extract tools, it also keeps changesets.
#
# Uses multiple passes to get everything:
#
# Pass 1:
#   Scan through all nodes,
#     - Make a list of nodes in BBOX
#     - Note changesets for each node
#   Scan through all ways
#     - Make a list of ways having at least one node in list
#     - Note changesets for each way
#   Scan through all relations
#     - Make a list of relations having at least one node or way in lists
#     - Note changesets for each relation
#
# Pass 2:
#   Copy listed changesets to new file
#   Copy listed nodes to new file
#   Copy listed ways to new file
#   Copy listed relations to new file
#
# Of course, all tags for each object are also copied.
#
# Since this is designed to work with historical data, it tends to grab more
# than it needs. Specifically, old, deleted nodes will cause ways and
# relations to be included (etc.). It does not resolve missing objects though
# (i.e., nodes outside the BBOX that are in an included way).
#
#---------------------------------------------------------------------------
# Name: OSM Extract
# Version: 1.0
# Authored By: Eric Wolf
# Copyright: Public Domain.
#---------------------------------------------------------------------------
# Command line parameters
# D:\GNIS_OSM\rhode_island.osm.bz2 D:\GNIS_OSM\ri_extract.osm.bz2 -72 -71 42 41
# D:\GNIS_OSM\djibouti.osm.bz2 D:\GNIS_OSM\dj_extract.osm.bz2 42.7 42.8 11.6 11.5
# D:\GNIS_OSM\full-planet-110115-1800.osm.bz2 D:\GNIS_OSM\full_extract.osm.bz2 -72 -71 42 41

# Import modules
import sys
import os
import fileinput
import bz2
import math
import time


# BigXMLfile
#
# Reads bz2 compressed OSM XML file
#
# Does not decompress the file.
# Does not use "readline" or the equivalent.
# So it works with full-planet.osm.bz2
#
# Probably need to make this a separate module
#
class BigXMLfile:
    """Tag-at-a-time reader for a bz2-compressed XML file.

    The file is read through ``bz2.BZ2File`` in fixed-size chunks and split on
    the '>' character, so neither line breaks nor the total file size matter.
    """

    def __init__(self, filename):
        """Open *filename* for reading and load the first chunk."""
        self.name = filename
        if sys.version_info[0] < 3:
            # Python 2 accepts a buffer size as third positional argument.
            self.fp = bz2.BZ2File(filename, 'rb', 16384 * 64)
        else:
            # Python 3 dropped/deprecated the buffering argument.
            self.fp = bz2.BZ2File(filename, 'rb')
        self.buffsize = 16384   # How many bytes to read per chunk
        self.bufpos = 0         # Offset of the first unconsumed byte
        self.buffer = self.fp.read(self.buffsize)
        self.bread = len(self.buffer)   # Total uncompressed bytes read

    def getXMLtag(self):
        """Return the next XML tag as text, or '' at end of file.

        The returned string is everything from the current position up to and
        including the next '>', stripped of surrounding whitespace and decoded
        as UTF-8 (undecodable bytes are replaced).
        """
        # find the close bracket
        cb = self.buffer.find(b'>', self.bufpos)

        # Keep pulling chunks until the bracket shows up.  A tag may be longer
        # than one chunk, so a single refill is not always enough.
        while cb < 0:
            newb = self.fp.read(self.buffsize)
            # Hit the end of the file, need to return nada
            if not newb:
                return ''
            self.bread += len(newb)
            # The unconsumed tail was already searched; resume after it.
            search_from = len(self.buffer) - self.bufpos
            # Copy the end of the buffer to head, tack on the new stuff
            self.buffer = self.buffer[self.bufpos:] + newb
            self.bufpos = 0
            cb = self.buffer.find(b'>', search_from)

        # Pick out the tag and clean it up
        tag = self.buffer[self.bufpos:cb + 1].strip()

        # shift our buffer pointer up
        self.bufpos = cb + 1

        # Source should be in utf-8, but sometimes its not.
        return tag.decode("utf-8", "replace")

    def getBread(self):
        """Return the number of uncompressed bytes read so far."""
        return self.bread

    def close(self):
        """Close the underlying bz2 file."""
        self.fp.close()
# class BigXMLfile


#---------------------------------------------------------------------------
#Gets the XML element name from the string passed in
#an end of element tag is /element
#---------------------------------------------------------------------------
def getElement(line):
    """Return the element name of the tag in *line*.

    Closing tags keep their leading slash ('</node>' -> '/node').  The name
    ends at the first whitespace or '>'; the trailing slash of an
    attribute-less self-closing tag ('<bounds/>') is dropped.  Returns '' when
    *line* contains no '<'.
    """
    s = line.find('<')
    if s < 0:
        return ''
    end = len(line)
    for stop in (' ', '\t', '\r', '\n', '>'):
        pos = line.find(stop, s + 1)
        if 0 <= pos < end:
            end = pos
    el = line[s + 1:end]
    if len(el) > 1 and el.endswith('/'):
        el = el[:-1]
    return el
#---------------------------------------------------------------------------


#---------------------------------------------------------------------------
#Gets the value of the named attribute from the string
#---------------------------------------------------------------------------
def getAttributeValue(name, line):
    """Return the double-quoted value of attribute *name* in *line*.

    The search key includes the leading space, so asking for 'id' does not
    match inside 'uid'.  Returns '' when the attribute is missing or its
    closing quote cannot be found.
    """
    marker = ' ' + name + '="'
    sa = line.find(marker)
    if sa < 0:
        return ''
    sa += len(marker)
    ea = line.find('"', sa)
    if ea < 0:
        return ''
    return line[sa:ea]
#---------------------------------------------------------------------------


def _formatTimestamp(t):
    """Convert an OSM timestamp to 'MM/DD/YYYY hh:mm:ss AM|PM'.

    #                               01234567890123456789
    # Timestamp comes in like this: 2011-01-25T19:13:46Z
    # Needs to go out like this:    01/25/2011 07:13:46 PM

    Raises ValueError when the hour field is not a number (for example when
    the timestamp attribute was missing).
    """
    h = int(t[11:13])
    # Hours 12..23 are PM; 0 and 12 are both shown as "12" on a 12-hour clock.
    m = ' PM' if h >= 12 else ' AM'
    h = h % 12
    if h == 0:
        h = 12
    s = '/'
    return t[5:7] + s + t[8:10] + s + t[0:4] + ' ' + "%02d" % h + t[13:19] + m


def _returnCommon(line):
    """Return (id, version, timestamp, uid, user, changeset) parsed from *line*."""
    return (getAttributeValue('id', line),
            getAttributeValue('version', line),
            _formatTimestamp(getAttributeValue('timestamp', line)),
            getAttributeValue('uid', line),
            getAttributeValue('user', line),
            getAttributeValue('changeset', line))


#---------------------------------------------------------------------------
#Extract Node attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnNode(line):
    """Return (id, lon, lat, version, timestamp, uid, user, changeset).

    All values are strings; the timestamp is reformatted to
    'MM/DD/YYYY hh:mm:ss AM|PM'.  Raises ValueError on a missing or malformed
    timestamp.
    """
    nid, nver, nts, nuid, nuser, ncs = _returnCommon(line)
    nx = getAttributeValue('lon', line)
    ny = getAttributeValue('lat', line)
    return (nid, nx, ny, nver, nts, nuid, nuser, ncs)
#---------------------------------------------------------------------------


#---------------------------------------------------------------------------
#Extract Relation attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnRelation(line):
    """Return (id, version, timestamp, uid, user, changeset) for a relation.

    All values are strings; the timestamp is reformatted to
    'MM/DD/YYYY hh:mm:ss AM|PM'.  Raises ValueError on a missing or malformed
    timestamp.
    """
    #        0    1    2    3     4     5
    return _returnCommon(line)
#---------------------------------------------------------------------------


#---------------------------------------------------------------------------
#Extract Way attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnWay(line):
    """Return (id, version, timestamp, uid, user, changeset) for a way.

    All values are strings; the timestamp is reformatted to
    'MM/DD/YYYY hh:mm:ss AM|PM'.  Raises ValueError on a missing or malformed
    timestamp.
    """
    #        0    1    2    3     4     5
    return _returnCommon(line)
#---------------------------------------------------------------------------


#---------------------------------------------------------------------------
#get the id attribute from a line of xml text
#used for ways and its segs, as id is only attribute needed
#---------------------------------------------------------------------------
def returnID(line):
    """Return the value of the 'id' attribute in *line* ('' if absent)."""
    return getAttributeValue('id', line)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
def returnTags(line):
    """Return the (key, value) pair of a <tag> element as Latin-1 byte strings.

    The key is left-stripped, has ':' replaced by '_' and is cut to 29 bytes;
    the value is cut to 254 bytes.  Characters that do not exist in Latin-1
    are replaced by '?'.
    NOTE(review): the 29/254 limits look like field-size limits of the
    downstream table format -- confirm before changing them.
    """
    # Standard fields can't have certain characters in the key name (like gnis:id)
    # The replacement is done before encoding so it works on text in both
    # Python 2 and 3; ':' and '_' are single Latin-1 bytes, so the result and
    # its length are the same as replacing afterwards.
    key_text = getAttributeValue('k', line).lstrip().replace(':', '_')
    k = key_text.encode("Latin-1", "replace")[:29]
    v = getAttributeValue('v', line).encode("Latin-1", "replace")[:254]
    return (k, v)
#---------------------------------------------------------------------------


# Resolve: Make this a little more flexible
try:
    inFile = str(sys.argv[1])
    outfile = str(sys.argv[2])
    bbox_left = float(sys.argv[3])
    bbox_right = float(sys.argv[4])
    bbox_top = float(sys.argv[5])
    bbox_bottom = float(sys.argv[6])
except (IndexError, ValueError):
    # Too few arguments, or a bounding-box value that is not a number.
    print("Input parameters are incorrect\n\n")
    print("Usage:\n")
    print(" osm_extract.py input_file.osm.bz2 output_file.osm.bz2 left right top bottom\n")
    sys.exit(-1)

# time.clock() no longer exists on Python 3.8+; fall back to wall-clock time.
_timer = getattr(time, 'clock', None) or time.time
start = _timer()

# Do we want relations?
include_relations = True

# Do we want to resolve ways
resolve_ways = True

node_list = set()
way_list = set()
relation_list = set()
changeset_list = set()

#
# Processing flags
#

# Enable/disable the use of bz2 compression for temp files
useBZ2_temp_files = False

# Enable/disable deleting temp files (for debugging)
delete_temp_files = False

total_node_count = 0
total_tag_count = 0
total_way_count = 0
total_way_node_count = 0
total_rel_count = 0
total_rel_mem_count = 0

keep_node_count = 0
keep_way_count = 0
keep_way_node_count = 0
keep_rel_count = 0
keep_rel_mem_count = 0
keep_tag_count = 0

this_tag_count = 0
this_rel_mem_count = 0

# Start the minimums above any real id so the first id seen always wins.
min_node_id = sys.maxsize
max_node_id = 0
min_way_node_id = sys.maxsize
max_way_node_id = 0

keep_relation = False
keep_node = False
keep_way = False

#
# Step 1: Scan input file, build lists
#
try:
    # Input is always a BZ2 compressed file
    inputfile = BigXMLfile(inFile)

    print("Step 1: List nodes in BBOX")
    line_count = 0
    xml_tag_bytes = 0
    total_bytes_to_write = 0

    while True:
        # Read one XML tag without depending on line breaks
        # (so this works with history files)
        line = inputfile.getXMLtag()
        if line == '':
            break

        line_count += 1
        xml_tag_bytes += len(line)
        if (line_count % 250000) == 0:
            print("Processed " + str(line_count) + " lines.")

        # Closing tags are named up to '>', opening tags up to the first space
        if line[1:2] == '/':
            element = line[1:line.find('>', 1)]
        else:
            element = line[1:line.find(' ', 1)]

        # Do different things based on the element type (node/way/relation)
        if element == 'node':
            s = line.find('id="', 5) + 4
            e = line.find('"', s)
            node_id = int(line[s:e])
            if node_id > max_node_id:
                max_node_id = node_id
            if node_id < min_node_id:
                min_node_id = node_id

            # Is the node in the BBOX?  Longitude is only parsed when the
            # latitude test has already passed.
            s = line.find('lat="', 5) + 5
            e = line.find('"', s)
            lat = float(line[s:e])
            outside = lat < bbox_bottom or lat > bbox_top
            if not outside:
                s = line.find('lon="', 5) + 5
                e = line.find('"', s)
                lon = float(line[s:e])
                outside = lon < bbox_left or lon > bbox_right

            # Set afresh for every node so a previous node's flag cannot leak
            keep_node = not outside
            if keep_node:
                node_list.add(node_id)
                s = line.find('changeset="', 5) + 11
                e = line.find('"', s)
                cs = int(line[s:e])
                changeset_list.add(cs)

            # A self-closing <node .../> has no </node>, so account for it now
            if line.endswith('/>'):
                total_node_count += 1
                if keep_node:
                    keep_node_count += 1
                    total_bytes_to_write += xml_tag_bytes
                    keep_node = False
                xml_tag_bytes = 0
                this_tag_count = 0

        # End of node
        elif element == '/node':
            total_node_count += 1
            total_tag_count += this_tag_count
            if keep_node:
                keep_node_count += 1
                keep_tag_count += this_tag_count
                total_bytes_to_write += xml_tag_bytes
                keep_node = False
            xml_tag_bytes = 0
            this_tag_count = 0

        # Start of way
        elif element == 'way':
            s = line.find('id="', 4) + 4
            e = line.find('"', s)
            way_id = int(line[s:e])
            s = line.find('changeset="', 5) + 11
            e = line.find('"', s)
            way_cs = int(line[s:e])
            way_nodes = set()
            keep_way = False
            # A self-closing way has no nodes, so it can never be kept
            if line.endswith('/>'):
                total_way_count += 1
                xml_tag_bytes = 0
                this_tag_count = 0

        # Way nodes
        elif element == 'nd':
            total_way_node_count += 1
            s = line.find('ref="', 1) + 5
            e = line.find('"', s)
            node_id = int(line[s:e])
            way_nodes.add(node_id)
            if node_id in node_list:
                keep_way = True
            if node_id > max_way_node_id:
                max_way_node_id = node_id
            if node_id < min_way_node_id:
                min_way_node_id = node_id

        # End of way - check if we keep it
        elif element == '/way':
            total_way_count += 1
            if keep_way:
                way_list.add(way_id)
                changeset_list.add(way_cs)
                # This adds nodes not in BBOX but part of way that intersects it
                # This really slows things down!
                if resolve_ways:
                    node_list.update(way_nodes)
                keep_way_node_count += len(way_nodes)
                keep_tag_count += this_tag_count
                keep_way_count += 1
                total_bytes_to_write += xml_tag_bytes
            total_tag_count += this_tag_count
            this_tag_count = 0
            keep_way = False
            xml_tag_bytes = 0

        # Tags are only counted right now
        elif element == 'tag':
            this_tag_count += 1

        elif element == 'relation' and include_relations:
            s = line.find('id="', 4) + 4
            e = line.find('"', s)
            rel_id = int(line[s:e])
            s = line.find('changeset="', 5) + 11
            e = line.find('"', s)
            rel_cs = int(line[s:e])
            relation_ways = set()
            relation_nodes = set()
            keep_relation = False
            # A self-closing relation has no members, so it can never be kept
            if line.endswith('/>'):
                total_rel_count += 1
                xml_tag_bytes = 0
                this_tag_count = 0

        elif element == 'member':
            this_rel_mem_count += 1
            if include_relations:
                s = line.find('type="', 1) + 6
                e = line.find('"', s)
                memtype = line[s:e]
                s = line.find('ref="', 1) + 5
                e = line.find('"', s)
                member = int(line[s:e])
                if memtype == 'way':
                    relation_ways.add(member)
                    if member in way_list:
                        keep_relation = True
                elif memtype == 'node':
                    relation_nodes.add(member)
                    if member in node_list:
                        keep_relation = True
                # Members of type 'relation' are skipped: their ids are not
                # node ids and must not end up in the node list.

        elif element == '/relation':
            if keep_relation:
                relation_list.add(rel_id)
                changeset_list.add(rel_cs)
                node_list.update(relation_nodes)
                way_list.update(relation_ways)
                keep_rel_count += 1
                keep_rel_mem_count += this_rel_mem_count
                total_bytes_to_write += xml_tag_bytes
                keep_tag_count += this_tag_count
            total_rel_count += 1
            total_rel_mem_count += this_rel_mem_count
            this_rel_mem_count = 0
            total_tag_count += this_tag_count
            this_tag_count = 0
            xml_tag_bytes = 0
            keep_relation = False
        # if element==...
    # while True:

    # Done scanning; release the input file handle
    inputfile.fp.close()

    print('Bytes read from OSM file: ' + str(inputfile.getBread()))
    print('Bytes to be written: ' + str(total_bytes_to_write))
    print("Lines/Elements Scanned: " + str(line_count) + " lines")
    print("Total Nodes: " + str(total_node_count))
    print("Kept Nodes: " + str(keep_node_count))
    print("Total Ways: " + str(total_way_count))
    print("Kept Ways: " + str(keep_way_count))
    print("Total Way Nodes: " + str(total_way_node_count))
    print("Kept Way Nodes: " + str(keep_way_node_count))
    print("Total Relations: " + str(total_rel_count))
    print("Kept Relations: " + str(keep_rel_count))
    print("Total Relation Members:" + str(total_rel_mem_count))
    print("Kept Relation Members:" + str(keep_rel_mem_count))
    print("Total Tags: " + str(total_tag_count))
    print("Kept Tags: " + str(keep_tag_count))
    print("Changeset list count: " + str(len(changeset_list)))
    print("Node list count: " + str(len(node_list)))
    print("Way list count: " + str(len(way_list)))
    print("Relation list count: " + str(len(relation_list)))
    print("Node Ids, min(" + str(min_node_id) + "),max(" + str(max_node_id) + ")")
    print("Way Node Ids, min(" + str(min_way_node_id) + "),max(" + str(max_way_node_id) + ")")

except Exception as ErrorDesc:
    # Top-level boundary: report the failure and exit with a non-zero status
    print("Step 1 Failed : " + str(ErrorDesc))
    finish = _timer()
    print("Extract incomplete in " + str(finish - start) + " seconds.")
    sys.exit(-2)

finish = _timer()
print("Extract complete in " + str(finish - start) + " seconds.")