#---------------------------------------------------------------------------
# OSM_Extract.py
#
# Extracts a BBOX footprint from a planet file. Like Frederik Ramm's
# history-extract.pl, except that it works directly against the compressed
# .bz2 file and does not care about end-of-lines.
#
# That is, you can use it against the full-planet.bz2 file.
#
# Writes directly to a .bz2 file so disk space usage is minimized.
#
# Unlike other extract tools, it also keeps changesets.
#
# Uses multiple passes to get everything:
#
# Pass 1:
# Scan through all nodes,
# - Make a list of nodes in BBOX
# - Note changesets for each node
# Scan through all ways
# - Make a list of ways having at least one node in list
# - Note changesets for each way
# Scan through all relations
# - Make a list of relations having at least one node or way in lists
# - Note changesets for each relation
#
# Pass 2:
# Copy listed changesets to new file
# Copy listed nodes to new file
# Copy listed ways to new file
# Copy listed relations to new file
#
# Of course, all tags for each object are also copied.
#
# Since this is designed to work with historical data, it tends to grab more than
# it needs. Specifically, old, deleted nodes will cause ways and relations to be
# included (etc.). Missing objects (i.e., nodes outside the BBOX that are in an
# included way) are only resolved when the resolve_ways flag is set.
#
#---------------------------------------------------------------------------
# Name: OSM Extract
# Version: 1.0
# Authored By: Eric Wolf
# Copyright: Public Domain.
#---------------------------------------------------------------------------
# Command line parameters
# D:\GNIS_OSM\rhode_island.osm.bz2 D:\GNIS_OSM\ri_extract.osm.bz2 -72 -71 42 41
# D:\GNIS_OSM\djibouti.osm.bz2 D:\GNIS_OSM\dj_extract.osm.bz2 42.7 42.8 11.6 11.5
# D:\GNIS_OSM\full-planet-110115-1800.osm.bz2 D:\GNIS_OSM\full_extract.osm.bz2 -72 -71 42 41
# Import modules
import sys, os
import fileinput, bz2
import math, time
# BigXMLfile
#
# Reads bz2 compressed OSM XML file
#
# Does not decompress the file to disk; it is decompressed in chunks as it is read.
# Does not use "readline" or the equivalent.
# So it works with full-planet.osm.bz2
#
# Probably need to make this a separate module
#
class BigXMLfile:
    """Read XML tags one at a time from a bz2-compressed file.

    The file is decompressed in chunks while it is read and is split on the
    '>' character, so the reader does not depend on line breaks.

    Attributes:
        name:     the file name passed to the constructor.
        fp:       the open bz2 file object.
        buffsize: number of decompressed bytes requested per read.
        buffer:   decompressed bytes that have been read but not all consumed.
        bufpos:   offset in ``buffer`` of the first unconsumed byte.
        bread:    total number of decompressed bytes read so far.
    """

    def __init__(self, filename):
        """Open *filename* and load the first chunk into the buffer."""
        self.name = filename
        # NOTE(review): the third positional argument is the `buffering`
        # size of the Python 2 bz2 API; newer interpreters reject it.
        self.fp = bz2.BZ2File(filename, 'rb', 16384*64)
        self.buffsize = 16384  # How many bytes to request per read
        self.bufpos = 0
        self.buffer = self.fp.read(self.buffsize)
        self.bread = len(self.buffer)

    def getXMLtag(self):
        """Return the next XML tag as text, or '' at end of file.

        Everything from the current position up to and including the next
        '>' is taken, stripped of surrounding whitespace and decoded as
        UTF-8 (undecodable bytes are replaced rather than raising).
        """
        # find the close bracket
        cb = self.buffer.find(b'>', self.bufpos)
        if cb < 0:
            # Hit the end of the buffer: keep the unfinished tail and keep
            # reading until the tag is complete.  Looping (instead of a
            # single read) lets a tag be longer than buffsize.
            pending = self.buffer[self.bufpos:]
            self.bufpos = 0
            while cb < 0:
                newb = self.fp.read(self.buffsize)
                # Hit the end of the file, need to return nada
                if len(newb) == 0:
                    self.buffer = pending
                    return ''
                self.bread = self.bread + len(newb)
                # Only the new bytes can contain the close bracket
                searched = len(pending)
                pending = pending + newb
                cb = pending.find(b'>', searched)
            self.buffer = pending
        # Pick out the tag and clean it up
        tag = self.buffer[self.bufpos:cb + 1].strip()
        # shift our buffer pointer up
        self.bufpos = cb + 1
        # Source should be in utf-8, but sometimes it's not.
        return tag.decode("utf-8", "replace")

    def getBread(self):
        """Return the number of decompressed bytes read so far."""
        return self.bread
# class BigXMLfile
#---------------------------------------------------------------------------
#Gets the XML element name from the string passed in
#an end of element tag is /element
#---------------------------------------------------------------------------
def getElement(line):
    """Return the element name of the first XML tag found in *line*.

    An end tag keeps its leading slash, so '</node>' yields '/node'.  The
    name ends at the first whitespace, '/' or '>' character, which makes
    tags without attributes ('<osm>') and self-closing tags ('<bounds/>')
    work as well.  Returns '' when *line* contains no '<'.
    """
    start = line.find('<')
    if start < 0:
        return ''
    end = start + 1
    # The slash of an end tag is part of the returned name
    if line[end:end + 1] == '/':
        end += 1
    length = len(line)
    while end < length and line[end] not in ' \t\r\n/>':
        end += 1
    return line[start + 1:end]
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Gets the value of the named attribute from the string
#---------------------------------------------------------------------------
def getAttributeValue(name,line):
    """Return the value of the double-quoted attribute *name* in *line*.

    The attribute name must be preceded by whitespace, so asking for 'id'
    does not match inside 'uid'.  Returns '' when the attribute is absent
    or its closing quote is missing.
    """
    marker = name + '="'
    pos = line.find(marker)
    while pos >= 0:
        if pos > 0 and line[pos - 1].isspace():
            sa = pos + len(marker)
            ea = line.find('"', sa)
            if ea < 0:
                return ''
            return line[sa:ea]
        # Matched the tail of a longer attribute name; keep looking
        pos = line.find(marker, pos + 1)
    return ''
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Extract Node attribute details from a line of xml text
#---------------------------------------------------------------------------
def formatTimestamp(t):
    """Convert an OSM timestamp to 12-hour 'MM/DD/YYYY hh:mm:ss AM|PM' form.

                                  01234567890123456789
    Timestamp comes in like this: 2011-01-25T19:13:46Z
    Needs to go out like this:    01/25/2011 07:13:46 PM

    Hours 12-23 are PM; hour 0 and hour 12 are both written as 12.
    """
    s = '/'
    h = int(t[11:13])
    if h >= 12:
        m = ' PM'
    else:
        m = ' AM'
    h = h % 12
    if h == 0:
        h = 12
    return t[5:7]+s+t[8:10]+s+t[0:4]+' '+"%02d"%h+t[13:19]+m
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Extract the attributes shared by nodes, ways and relations
#---------------------------------------------------------------------------
def _returnCommon(line):
    """Return (id, version, timestamp, uid, user, changeset) from *line*.

    All values are strings taken from the tag's attributes; the timestamp
    is reformatted by formatTimestamp().
    """
    oid=getAttributeValue('id',line)
    over=getAttributeValue('version',line)
    ots=formatTimestamp(getAttributeValue('timestamp',line))
    ouid=getAttributeValue('uid',line)
    ouser=getAttributeValue('user',line)
    ocs=getAttributeValue('changeset',line)
    return(oid,over,ots,ouid,ouser,ocs)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Extract Node attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnNode(line):
    """Return (id, lon, lat, version, timestamp, uid, user, changeset)."""
    (nid,nver,nts,nuid,nuser,ncs)=_returnCommon(line)
    nx=getAttributeValue('lon',line)
    ny=getAttributeValue('lat',line)
    return(nid,nx,ny,nver,nts,nuid,nuser,ncs)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Extract Relation attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnRelation(line):
    """Return (id, version, timestamp, uid, user, changeset)."""
    #      0   1       2         3   4    5
    return _returnCommon(line)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#Extract Way attribute details from a line of xml text
#---------------------------------------------------------------------------
def returnWay(line):
    """Return (id, version, timestamp, uid, user, changeset)."""
    #      0   1       2         3   4    5
    return _returnCommon(line)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#get the id attribute from a line of xml text
#used for ways and its segs, as id is only attribute needed
#---------------------------------------------------------------------------
def returnID(line):
    """Return the value of the 'id' attribute of the tag in *line*."""
    identifier = getAttributeValue('id', line)
    return identifier
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
def returnTags(line):
    """Return the (key, value) pair of a <tag> element, trimmed for storage.

    Both parts are encoded as Latin-1 (unencodable characters are replaced).
    The key loses leading whitespace, is cut to 29 characters and has every
    ':' turned into '_'; the value is cut to 254 characters.
    """
    encoded_key = getAttributeValue('k', line).lstrip().encode("Latin-1", "replace")
    # Standard fields can't have certain characters in the key name (like gnis:id)
    key = encoded_key[:29].replace(':', '_')
    encoded_value = getAttributeValue('v', line).encode("Latin-1", "replace")
    value = encoded_value[:254]
    return (key, value)
#---------------------------------------------------------------------------
# Resolve: Make this a little more flexible
# Read the six positional command line parameters:
#   input file, output file, then the BBOX as left right top bottom.
# A missing or non-numeric parameter, or a BBOX whose left/right or
# bottom/top are swapped (which could never match a node), prints the usage
# text and exits with status -1.
try:
    inFile = str(sys.argv[1])
    outfile = str(sys.argv[2])
    bbox_left = float(sys.argv[3])
    bbox_right = float(sys.argv[4])
    bbox_top = float(sys.argv[5])
    bbox_bottom = float(sys.argv[6])
    if bbox_left > bbox_right or bbox_bottom > bbox_top:
        raise ValueError("bounding box is inverted")
except (IndexError, ValueError):
    # Only parameter problems are handled here, so Ctrl-C still interrupts
    print("Input parameters are incorrect\n\n")
    print("Usage:\n")
    print(" osm_extract.py input_file.osm.bz2 output_file.osm.bz2 left right top bottom\n")
    sys.exit(-1)
# Timer for the whole run.  time.clock() does not exist on newer
# interpreters, so fall back to time.time() when it is missing.
start = getattr(time, 'clock', time.time)()
# Do we want relations?
include_relations = True
# Do we want to resolve ways
resolve_ways = True
# Ids of the objects (and of their changesets) selected for the extract
node_list = set()
way_list = set()
relation_list = set()
changeset_list = set()
#
# Processing flags
#
# Enable/disable the use of bz2 compression for temp files
useBZ2_temp_files = False
# Enable/disable deleting temp files (for debugging)
delete_temp_files = False
# Counters over everything that was scanned
total_node_count = 0
total_tag_count = 0
total_way_count = 0
total_way_node_count = 0
total_rel_count = 0
total_rel_mem_count = 0
# Counters over the objects that were selected
keep_node_count = 0
keep_way_count = 0
keep_way_node_count = 0
keep_rel_count = 0
keep_rel_mem_count = 0
keep_tag_count = 0
# Counters for the object currently being read
this_tag_count = 0
this_rel_mem_count = 0
# Smallest/largest ids seen.  The minimum starts at the largest native
# integer so that the first real id always replaces it.
min_node_id = sys.maxsize
max_node_id = 0
min_way_node_id = sys.maxsize
max_way_node_id = 0
# Whether the object currently being read is selected
keep_relation = False
keep_node = False
keep_way = False
#
# Step 1: Scan input file, build lists
#
def _step1_element_name(tag):
    """Return the element name of *tag*; an end tag keeps its leading '/'."""
    end = 1
    if tag[1:2] == '/':
        end = 2
    length = len(tag)
    while end < length and tag[end] not in ' \t\r\n/>':
        end += 1
    return tag[1:end]


def _step1_attr(tag, name):
    """Return the value of attribute *name* in *tag*, or None when absent.

    The name must follow whitespace so that 'id' is not found inside 'uid'.
    """
    key = name + '="'
    at = tag.find(key, 1)
    while at > 0 and not tag[at - 1].isspace():
        at = tag.find(key, at + 1)
    if at < 0:
        return None
    first = at + len(key)
    last = tag.find('"', first)
    if last < 0:
        return None
    return tag[first:last]


def _step1_int_attr(tag, name):
    """Return attribute *name* of *tag* as an int; ValueError when absent."""
    value = _step1_attr(tag, name)
    if value is None:
        # %r keeps the message plain ASCII even if the tag is not
        raise ValueError('missing attribute %s in %r' % (name, tag[:80]))
    return int(value)


# Same timer as used for `start`: time.clock() where it exists
_step1_clock = getattr(time, 'clock', time.time)

try:
    # Input is always a BZ2 compressed file
    inputfile = BigXMLfile(inFile)
    print("Step 1: List nodes in BBOX")
    line_count=0
    xml_tag_bytes = 0
    total_bytes_to_write = 0
    # Nodes/ways pulled in only because a kept way or relation refers to
    # them.  They are merged into node_list/way_list after the scan so that
    # they cannot make further, unrelated ways and relations qualify.
    resolved_node_ids = set()
    resolved_way_ids = set()
    while True:
        # Read one XML tag without depending on line breaks
        # (so this works with history files)
        line = inputfile.getXMLtag()
        if line == '':
            break
        line_count += 1
        # Length of the decoded tag; used as an estimate of the output size
        xml_tag_bytes += len(line)
        if (line_count % 250000) == 0:
            print("Processed " + str(line_count) + " lines.")

        element = _step1_element_name(line)
        # `closed` names the element this tag finishes: either an explicit
        # end tag or a self-closing start tag such as <node ... />.
        if element[:1] == '/':
            closed = element[1:]
        elif line.endswith('/>'):
            closed = element
        else:
            closed = None

        # ---- start of an element ----
        if element == 'node':
            node_id = _step1_int_attr(line, 'id')
            if node_id > max_node_id:
                max_node_id = node_id
            if node_id < min_node_id:
                min_node_id = node_id
            keep_node = False
            lat_text = _step1_attr(line, 'lat')
            lon_text = _step1_attr(line, 'lon')
            # A node without coordinates cannot be placed in the BBOX
            if lat_text and lon_text:
                lat = float(lat_text)
                lon = float(lon_text)
                # Is the node in the BBOX?
                if bbox_bottom <= lat <= bbox_top and bbox_left <= lon <= bbox_right:
                    keep_node = True
                    node_list.add(node_id)
                    cs = _step1_int_attr(line, 'changeset')
                    changeset_list.add(cs)
        # Start of way
        elif element == 'way':
            way_id = _step1_int_attr(line, 'id')
            way_cs = _step1_int_attr(line, 'changeset')
            way_nodes = set()
            keep_way = False
        # Way nodes
        elif element == 'nd':
            total_way_node_count += 1
            node_id = _step1_int_attr(line, 'ref')
            way_nodes.add(node_id)
            if node_id in node_list:
                keep_way = True
            if node_id > max_way_node_id:
                max_way_node_id = node_id
            if node_id < min_way_node_id:
                min_way_node_id = node_id
        # Tags are only counted right now
        elif element == 'tag':
            this_tag_count += 1
        elif element == 'relation' and include_relations:
            rel_id = _step1_int_attr(line, 'id')
            rel_cs = _step1_int_attr(line, 'changeset')
            relation_ways = set()
            relation_nodes = set()
            keep_relation = False
        elif element == 'member':
            this_rel_mem_count += 1
            if include_relations:
                memtype = _step1_attr(line, 'type')
                member = _step1_int_attr(line, 'ref')
                if memtype == 'way':
                    relation_ways.add(member)
                    if member in way_list:
                        keep_relation = True
                elif memtype == 'node':
                    relation_nodes.add(member)
                    if member in node_list:
                        keep_relation = True
                # Members of type 'relation' are ignored: their ids must not
                # be looked up in, or added to, the node list.

        # ---- end of an element ----
        if closed == 'node':
            total_node_count += 1
            if keep_node:
                keep_node_count += 1
                keep_tag_count += this_tag_count
                total_bytes_to_write += xml_tag_bytes
            total_tag_count += this_tag_count
            this_tag_count = 0
            xml_tag_bytes = 0
            keep_node = False
        # End of way - check if we keep it
        elif closed == 'way':
            total_way_count += 1
            if keep_way:
                way_list.add(way_id)
                changeset_list.add(way_cs)
                # This adds nodes not in BBOX but part of way that intersects it
                if resolve_ways:
                    resolved_node_ids.update(way_nodes)
                keep_way_node_count += len(way_nodes)
                keep_tag_count += this_tag_count
                keep_way_count += 1
                total_bytes_to_write += xml_tag_bytes
            total_tag_count += this_tag_count
            this_tag_count = 0
            keep_way = False
            xml_tag_bytes = 0
        elif closed == 'relation':
            if keep_relation:
                relation_list.add(rel_id)
                changeset_list.add(rel_cs)
                resolved_node_ids.update(relation_nodes)
                resolved_way_ids.update(relation_ways)
                keep_rel_count += 1
                keep_rel_mem_count += this_rel_mem_count
                total_bytes_to_write += xml_tag_bytes
                keep_tag_count += this_tag_count
            total_rel_count += 1
            total_rel_mem_count += this_rel_mem_count
            this_rel_mem_count = 0
            total_tag_count += this_tag_count
            this_tag_count = 0
            xml_tag_bytes = 0
            keep_relation = False
    # while True:
    # The scan is over, so the resolved members can join the lists now
    node_list.update(resolved_node_ids)
    way_list.update(resolved_way_ids)
    print('Bytes read from OSM file: ' + str(inputfile.getBread()))
    print('Bytes to be written: ' + str(total_bytes_to_write))
    print("Lines/Elements Scanned: " + str(line_count) + " lines")
    print("Total Nodes: " + str(total_node_count))
    print("Kept Nodes: " + str(keep_node_count))
    print("Total Ways: " + str(total_way_count))
    print("Kept Ways: " + str(keep_way_count))
    print("Total Way Nodes: " + str(total_way_node_count))
    print("Kept Way Nodes: " + str(keep_way_node_count))
    print("Total Relations: " + str(total_rel_count))
    print("Kept Relations: " + str(keep_rel_count))
    print("Total Relation Members:" + str(total_rel_mem_count))
    print("Kept Relation Members:" + str(keep_rel_mem_count))
    print("Total Tags: " + str(total_tag_count))
    print("Kept Tags: " + str(keep_tag_count))
    print("Changeset list count: " + str(len(changeset_list)))
    print("Node list count: " + str(len(node_list)))
    print("Way list count: " + str(len(way_list)))
    print("Relation list count: " + str(len(relation_list)))
    print("Node Ids, min("+str(min_node_id)+"),max("+str(max_node_id)+")")
    print("Way Node Ids, min("+str(min_way_node_id)+"),max("+str(max_way_node_id)+")")
except Exception as ErrorDesc:
    # Top-level boundary: report the problem and stop with status -2
    print("Step 1 Failed : " + str(ErrorDesc))
    finish = _step1_clock()
    print("Extract incomplete in " + str(finish - start) + " seconds.")
    sys.exit(-2)
finish = _step1_clock()
print("Extract complete in " + str(finish - start) + " seconds.")