Created based on the following requirements (defined by ReMax):
CSV Requirements
- Contains only properties listed from 2016 [DateListed]
- Contains only properties that contain the word “and” in the Description field
- CSV ordered by DateListed
- Required fields (a sample header row is shown below):
  - MlsId
  - MlsName
  - DateListed
  - StreetAddress
  - Price
  - Bedrooms
  - Bathrooms
  - Appliances (all sub-nodes comma joined)
  - Rooms (all sub-nodes comma joined)
  - Description (the first 200 characters)
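
For reference, the header row of the generated CSV, with the columns in the required order:

MlsId,MlsName,DateListed,StreetAddress,Price,Bedrooms,Bathrooms,Appliances,Rooms,Description
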
Technical Requirements
- Interpreter version: python 2.7
- Reasonable unit test coverage
- Raw information to parse / feed url http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml
- This feed must be downloaded from within the script; the raw data must not be downloaded manually
The provided requirements call for Python 2, but I initially developed for Python 3 (thank goodness I caught this requirements error before submitting!). The script therefore includes commented-out lines reflecting the differences between Python 3 and Python 2; just swap the commented-out lines to run this code in Python 3.
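As an aside, a version-agnostic import could avoid the comment swapping entirely. A minimal sketch (not used in the script below; the rest of the code would then call urlopen directly):

try:
    from urllib.request import urlopen  # python 3
except ImportError:
    from urllib2 import urlopen  # python 2
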
import sys
import os
import xml.etree.ElementTree as et
# python 3:
# import urllib.request
# python 2:
import urllib2 as urllib
import pandas

def download(url, download_to=""):
    # accepts a url pointing to an xml file
    # and a file system path for saving the downloaded file
    # assumes provided paths are valid
    # assumes the file is small enough to load into memory
    # a more robust version of the code would include error checking
    # and provide support for large file downloads
    # python 3:
    # response = urllib.request.urlopen(url)
    # python 2:
    response = urllib.urlopen(url)
    data = response.read()
    response.close()
    if download_to != "":
        # "wb" keeps this working under python 3, where the data is bytes
        outfile = open(download_to, "wb")
        outfile.write(data)
        outfile.close()
        return download_to
    return data
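
# a rough sketch of the more robust, large-file-friendly download mentioned above
# (illustrative only; the chunk size and helper name are assumptions, not part of the script):
#
# def download_large(url, download_to, chunk_size=64 * 1024):
#     response = urllib.urlopen(url)
#     with open(download_to, "wb") as outfile:
#         while True:
#             chunk = response.read(chunk_size)
#             if not chunk:
#                 break
#             outfile.write(chunk)
#     response.close()
#     return download_to
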
# this function can be expanded to implement more complex filters in the future
def include(filter_text, value_to_check):
    # case-insensitive substring check
    return str(filter_text).lower() in str(value_to_check).lower()
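# usage examples (the values are hypothetical):
#   include('2016', '2016-05-08 00:00:00')      -> True
#   include(' and ', 'Two bedrooms and a den')  -> True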
# gathering all of the fields into a data frame makes it accessible for
# further analysis or export to other formats
def gather_fields(xmlroot, iter_tag, filter_cols, csv_cols, df_cols):
    df = pandas.DataFrame(columns=df_cols)
    for listing in xmlroot.iter(tag=iter_tag):
        detaildata = {}
        for category in listing:
            for data in category:
                if data.tag in df_cols:
                    # join data values from children for csv columns
                    if data.tag in csv_cols:
                        detaildata[data.tag] = [', '.join([d.text for d in data])]
                    # apply filters for filter columns
                    elif data.tag in filter_cols.keys():
                        # there are two Description fields in the data but only one in the requirements
                        # in a 'real life' setting, we should double-check the requirements.
                        # for a one/two hour code sample, we'll just make a decision to exclude one field :)
                        if data.tag == 'Description' and 'BasicDetails' not in category.tag:
                            continue
                        filtermatch = include(filter_cols[data.tag], data.text)
                        if filtermatch:
                            detaildata[data.tag] = [data.text]
                    else:
                        detaildata[data.tag] = [data.text]
        # compare the columns we have to the filter columns
        filtered = [col for col in filter_cols if col in detaildata.keys()]
        if len(filtered) == len(filter_cols):
            # this is a row we want to include, so add it to our data frame
            row = pandas.DataFrame.from_dict(detaildata)
            df = df.append(row, ignore_index=True, sort=False)
    return df
def getxmlroot(url=None, downloaded=True, localxml=None):
    xmlroot = None
    if (not downloaded) and (localxml is None) and (url is not None):
        # fetch data from the url directly into the xmlroot
        xmldata = download(url)
        xmlroot = et.fromstring(xmldata)
    elif (not downloaded) and (localxml is not None) and (url is not None):
        # download the file and set the xml root
        download(url, localxml)
        xmlroot = et.parse(localxml).getroot()
    elif downloaded and localxml is not None:
        xmlroot = et.parse(localxml).getroot()
    return xmlroot
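# usage examples (feed_url and 'feed.xml' are placeholders):
#   getxmlroot(feed_url, downloaded=False)                       -> parse straight from the feed
#   getxmlroot(feed_url, downloaded=False, localxml='feed.xml')  -> download a local copy, then parse it
#   getxmlroot(downloaded=True, localxml='feed.xml')             -> parse an already-downloaded file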
def write_data(df, filepath):
    # to_csv creates/overwrites the file itself, so no separate open() call is needed
    df.to_csv(filepath, index=False)
    return
if __name__ == '__main__':
    # parameters are passed on the command line as <arg_name>=arg_value pairs
    # this is a short project, so it's still a bit fragile...bad parameter values can break the script
    # the script defines default values and, depending on your system setup, it may run successfully
    # if no parameters are passed
    # we can accept these parameters from sys.argv
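    # example invocation (the script and file names are placeholders):
    #   python parse_feed.py localxml=feed.xml downloaded=False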
    allowable_args = ['url', 'localxml', 'downloaded', 'df_cols', 'iter_tag', 'filepath']
    args = {}
    if len(sys.argv) > 1:  # parameters were passed
        for arg in sys.argv[1:]:
            x = str(arg).find('=')
            label = arg[:x]
            value = arg[x + 1:]
            # ignore anything we don't recognize
            if label in allowable_args:
                args[label] = value
    # let's get our hands on an xml root
    # we need the url of the xml, if we're downloading from a url
    url = 'http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml'
    if 'url' in args.keys():
        url = args['url']
    # if we want to download the data to a local xml file, we need to know where to save it
    localxml = None
    if 'localxml' in args.keys():
        localxml = args['localxml']
    # if we've already downloaded, no need to download again
    downloaded = False
    if 'downloaded' in args.keys():
        downloaded = (args['downloaded'] == 'True')
    # to process a file from the url without downloading a local copy,
    # pass no value for localxml and set downloaded to False
    xmlroot = getxmlroot(url, downloaded=downloaded, localxml=localxml)
    # once we have a parsed xml file, we can gather all of the field data into a dataframe
    # we need a list of columns that we want to extract from the xml tree
    df_cols = ['MlsId', 'MlsName', 'DateListed', 'StreetAddress', 'Price', 'Bedrooms', 'Bathrooms', 'Appliances', 'Rooms', 'Description']
    if 'df_cols' in args.keys():
        df_cols = args['df_cols'].split(',')
    # these are the columns that are populated from their child nodes
    csv_cols = ['Appliances', 'Rooms']
    # key is a node name, value is the text we want to find in the node
    filter_cols = {'DateListed': '2016', 'Description': ' and '}
    # this is the tag associated with each item that we want to gather data for
    iter_tag = 'Listing'
    if 'iter_tag' in args.keys():
        iter_tag = args['iter_tag']
    # unexpected values in the passed-in lists might break this
    # in a production environment, it would be rigorously tested, but
    # for a short coding project, a warning that it's fragile will suffice
    df = gather_fields(xmlroot, iter_tag, filter_cols, csv_cols, df_cols)
    # the requirements ask for the CSV to be ordered by DateListed
    # and for the Description field to be limited to its first 200 characters
    if 'DateListed' in df.columns:
        df = df.sort_values(by='DateListed')
    if 'Description' in df.columns:
        df['Description'] = df['Description'].str[:200]
    # finally we're going to write out our collected data
    # no matter what extension you provide for your filename, it will be written out in csv format
    filepath = os.path.join(os.getcwd(), 'remax.csv')
    if 'filepath' in args.keys():
        filepath = args['filepath']
    # tada. done. go check out the csv
    write_data(df, filepath)
You can also view the full project on GitHub!