Created based on the following requirements (defined by ReMax):
CSV Requirements
- Contains only properties listed from 2016 [DateListed]
- Contains only properties that contain the word “and” in the Description field
- CSV ordered by DateListed
- Required fields (a sample header row is shown below):
  - MlsId
  - MlsName
  - DateListed
  - StreetAddress
  - Price
  - Bedrooms
  - Bathrooms
  - Appliances (all sub-nodes comma joined)
  - Rooms (all sub-nodes comma joined)
  - Description (the first 200 characters)
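
For reference, the header row of the generated CSV, with the columns in the required order:

MlsId,MlsName,DateListed,StreetAddress,Price,Bedrooms,Bathrooms,Appliances,Rooms,Description
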
Technical Requirements
- Interpreter version: python 2.7
- Reasonable unit test coverage
- Raw information to parse / feed url http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml
- This feed must be downloaded from within the script; the raw data must not be downloaded manually
The provided requirements call for Python 2, but I initially developed for Python 3 (thank goodness I caught this requirements error before submitting!). The script therefore includes commented-out lines reflecting the differences between Python 3 and Python 2; just swap the commented-out lines to run this code in Python 3.
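As an aside, a version-agnostic import could avoid the comment swapping entirely. A minimal sketch (not used in the script below; the rest of the code would then call urlopen directly):

try:
    from urllib.request import urlopen  # python 3
except ImportError:
    from urllib2 import urlopen  # python 2
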
import sys
import os
import xml.etree.ElementTree as et
# python 3:
# import urllib.request
# python 2:
import urllib2 as urllib
import pandas

def download(url, download_to=""):
    # accepts a url pointing to an xml file
    # and a file system path for saving the downloaded file
    # assumes provided paths are valid
    # assumes the file is small enough to load into memory
    # a more robust version of the code would include error checking
    # and provide support for large file downloads
    # python 3:
    # response = urllib.request.urlopen(url)
    # python 2:
    response = urllib.urlopen(url)
    data = response.read()
    response.close()
    if download_to != "":
        # "wb" keeps this working under python 3, where the data is bytes
        outfile = open(download_to, "wb")
        outfile.write(data)
        outfile.close()
        return download_to
    return data
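
# a rough sketch of the more robust, large-file-friendly download mentioned above
# (illustrative only; the chunk size and helper name are assumptions, not part of the script):
#
# def download_large(url, download_to, chunk_size=64 * 1024):
#     response = urllib.urlopen(url)
#     with open(download_to, "wb") as outfile:
#         while True:
#             chunk = response.read(chunk_size)
#             if not chunk:
#                 break
#             outfile.write(chunk)
#     response.close()
#     return download_to
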
# this function can be expanded to implement more complex filters in the future
def include(filter_text, value_to_check):
    # case-insensitive substring check
    return str(filter_text).lower() in str(value_to_check).lower()
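# usage examples (the values are hypothetical):
#   include('2016', '2016-05-08 00:00:00')      -> True
#   include(' and ', 'Two bedrooms and a den')  -> True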
# gathering all of the fields into a data frame makes it accessible for
# further analysis or export to other formats
def gather_fields(xmlroot, iter_tag, filter_cols, csv_cols, df_cols):
    df = pandas.DataFrame(columns=df_cols)
    for listing in xmlroot.iter(tag=iter_tag):
        detaildata = {}
        for category in listing:
            for data in category:
                if data.tag in df_cols:
                    # join data values from children for csv columns
                    if data.tag in csv_cols:
                        detaildata[data.tag] = [', '.join([d.text for d in data])]
                    # apply filters for filter columns
                    elif data.tag in filter_cols.keys():
                        # there are two Description fields in the data but only one in the requirements
                        # in a 'real life' setting, we should double-check the requirements.
                        # for a one/two hour code sample, we'll just make a decision to exclude one field :)
                        if data.tag == 'Description' and 'BasicDetails' not in category.tag:
                            continue
                        filtermatch = include(filter_cols[data.tag], data.text)
                        if filtermatch:
                            detaildata[data.tag] = [data.text]
                    else:
                        detaildata[data.tag] = [data.text]
        # compare the columns we have to the filter columns
        filtered = [col for col in filter_cols if col in detaildata.keys()]
        if len(filtered) == len(filter_cols):
            # this is a row we want to include, so add it to our data frame
            row = pandas.DataFrame.from_dict(detaildata)
            df = df.append(row, ignore_index=True, sort=False)
    return df
def getxmlroot(url=None, downloaded=True, localxml=None):
    xmlroot = None
    if (not downloaded) and (localxml is None) and (url is not None):
        # fetch data from the url directly into the xmlroot
        xmldata = download(url)
        xmlroot = et.fromstring(xmldata)
    elif (not downloaded) and (localxml is not None) and (url is not None):
        # download the file and set the xml root
        download(url, localxml)
        xmlroot = et.parse(localxml).getroot()
    elif downloaded and localxml is not None:
        xmlroot = et.parse(localxml).getroot()
    return xmlroot
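# usage examples (feed_url and 'feed.xml' are placeholders):
#   getxmlroot(feed_url, downloaded=False)                       -> parse straight from the feed
#   getxmlroot(feed_url, downloaded=False, localxml='feed.xml')  -> download a local copy, then parse it
#   getxmlroot(downloaded=True, localxml='feed.xml')             -> parse an already-downloaded file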
def write_data(df, filepath):
    # to_csv creates/overwrites the file itself, so no separate open() call is needed
    df.to_csv(filepath, index=False)
    return
if __name__ == '__main__':
    # parameters are passed on the command line as <arg_name>=arg_value pairs
    # this is a short project, so it's still a bit fragile...bad parameter values can break the script
    # the script defines default values and, depending on your system setup, it may run successfully
    # if no parameters are passed
    # we can accept these parameters from sys.argv
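    # example invocation (the script and file names are placeholders):
    #   python parse_feed.py localxml=feed.xml downloaded=False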
    allowable_args = ['url', 'localxml', 'downloaded', 'df_cols', 'iter_tag', 'filepath']
    args = {}
    if len(sys.argv) > 1:  # parameters were passed
        for arg in sys.argv[1:]:
            x = str(arg).find('=')
            label = arg[:x]
            value = arg[x + 1:]
            # ignore anything we don't recognize
            if label in allowable_args:
                args[label] = value
    # let's get our hands on an xml root
    # we need the url of the xml, if we're downloading from a url
    url = 'http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml'
    if 'url' in args.keys():
        url = args['url']
    # if we want to download the data to a local xml file, we need to know where to save it
    localxml = None
    if 'localxml' in args.keys():
        localxml = args['localxml']
    # if we've already downloaded, no need to download again
    downloaded = False
    if 'downloaded' in args.keys():
        downloaded = (args['downloaded'] == 'True')
    # to process a file from the url without downloading a local copy,
    # pass no value for localxml and set downloaded to False
    xmlroot = getxmlroot(url, downloaded=downloaded, localxml=localxml)
    # once we have a parsed xml file, we can gather all of the field data into a dataframe
    # we need a list of columns that we want to extract from the xml tree
    df_cols = ['MlsId', 'MlsName', 'DateListed', 'StreetAddress', 'Price', 'Bedrooms', 'Bathrooms', 'Appliances', 'Rooms', 'Description']
    if 'df_cols' in args.keys():
        df_cols = args['df_cols'].split(',')
    # these are the columns that are populated from their child nodes
    csv_cols = ['Appliances', 'Rooms']
    # key is a node name, value is the text we want to find in the node
    filter_cols = {'DateListed': '2016', 'Description': ' and '}
    # this is the tag associated with each item that we want to gather data for
    iter_tag = 'Listing'
    if 'iter_tag' in args.keys():
        iter_tag = args['iter_tag']
    # unexpected values in the passed-in lists might break this
    # in a production environment, it would be rigorously tested, but
    # for a short coding project, a warning that it's fragile will suffice
    df = gather_fields(xmlroot, iter_tag, filter_cols, csv_cols, df_cols)
    # the requirements ask for the CSV to be ordered by DateListed
    # and for the Description field to be limited to its first 200 characters
    if 'DateListed' in df.columns:
        df = df.sort_values(by='DateListed')
    if 'Description' in df.columns:
        df['Description'] = df['Description'].str[:200]
    # finally we're going to write out our collected data
    # no matter what extension you provide for your filename, it will be written out in csv format
    filepath = os.path.join(os.getcwd(), 'remax.csv')
    if 'filepath' in args.keys():
        filepath = args['filepath']
    # tada. done. go check out the csv
    write_data(df, filepath)
You can also view the full project on GitHub!