Python script generates an .xml file: how to change every é to e? [duplicate]

This question already has an answer here:
What is the best way to remove accents in a python unicode string? 5 answers
I have a Python script that generates an XML file. Some of the attribute values in that file contain the character é, and I want to change every é to just a plain "e". Here is the Python script:

    #  Created by Spencer Fontein on 9/28/14.
#  Copyright (c) 2014 Spencer Fontein. All rights reserved.

# coding: utf-8

import cgi
import cookielib
import datetime
import pprint
import re
import urllib2
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
from lxml import etree

# Directory the generated XML file is written to. The file name is appended
# by plain string concatenation, so the trailing slash is required.
output_path = "/home/spencerf/public_html/rpi/"

def Get_website_text(url):
    """Download *url* with a cookie-aware, browser-like opener.

    Parameters:
        url: address of the page to fetch.

    Returns:
        The response body exactly as returned by ``response.read()``.

    Exceptions raised by urllib2 while opening or reading the URL are not
    caught here and propagate to the caller.
    """
    # The jar is bound to 'mfp.cookies', but this function never loads from
    # or saves to that file; cookies only live for the duration of the call.
    cj = cookielib.MozillaCookieJar('mfp.cookies')

    # Opener that follows redirects and keeps cookies between hops.
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj)
    )

    # Present a browser User-agent instead of the default Python one.
    opener.addheaders = [('User-agent',
        ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
         'AppleWebKit/535.1 (KHTML, like Gecko) '
         'Chrome/13.0.782.13 Safari/535.1'))
    ]

    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Close the connection even when read() fails part-way through.
        response.close()


#get union menus
def getUnionMenuUrls(soup, today=None):
    """Return ``[name, url]`` pairs for menu links whose date range covers today.

    The second and third ``<a href=...>`` links inside the first
    ``div#accordion_23477`` are inspected. A link is kept when its href
    contains ".htm" and its text, once newlines, apostrophes and spaces are
    removed, is a ``m/d/Y-m/d/Y`` range that includes ``today``.

    Parameters:
        soup: parsed page supporting ``findAll`` (BeautifulSoup-style).
        today: optional ``datetime.date`` or ``datetime.datetime`` to test
            against; defaults to the current local date.

    Returns:
        A list of ``[name, url]`` lists, where ``name`` is the cleaned link
        text and ``url`` is the base URL concatenated with the link's href.

    Raises:
        IndexError: if the accordion div is missing or a link text does not
            split into two dates.
        ValueError: if a date in the link text does not match ``%m/%d/%Y``.
    """
    if today is None:
        today = datetime.datetime.today()
    # Compare calendar dates only. A datetime carries the current time of
    # day, which is later than the midnight value parsed for the end of the
    # range and would wrongly exclude the last day of every range.
    if isinstance(today, datetime.datetime):
        today = today.date()

    monthly_urls = soup.findAll('div', {'id': 'accordion_23477'})[0]('a', href=True)[1:3]
    url = "http://ift.tt/YegLus"
    menu_urls = []
    for tag in monthly_urls:
        href = tag['href']
        if ".htm" not in href:
            continue
        name = str(tag.text)
        name = name.replace('\n', '').replace("'", '').replace(' ', '')
        # The cleaned text looks like "9/28/2014-10/4/2014".
        datestrings = name.split('-')
        date_range = [datetime.datetime.strptime(d, '%m/%d/%Y').date()
                      for d in datestrings]
        if date_range[0] <= today <= date_range[1]:
            menu_urls.append([name, url + href])
    return menu_urls


def get_xml(url):
    """Download the menu page at *url* and render it as nested XML text.

    The page is parsed as HTML. Every ``td.dayouter`` cell becomes a
    ``<day>`` element inside a single ``<menu>`` root; rows of its
    ``table.dayinner`` open ``<meal>``, ``<counter>`` and ``<dish>``
    elements as the matching cells appear. Each dish gets a ``<name>``
    child holding the escaped item text.

    Returns:
        The generated lines joined by newlines, each encoded as UTF-8.
    """
    stack = []
    lines = []

    # Drop non-breaking-space entities before parsing.
    page = urllib2.urlopen(url).read()
    page = page.replace('&nbsp;', "")
    tree = etree.HTML(page)

    open_tag(stack, lines, "menu", "")

    for day_cell in tree.xpath('//td[@class="dayouter"]'):
        # The day's label is the name of the anchor inside the cell.
        anchor_names = day_cell.xpath('./a/@name')
        safe_open_tag(stack, lines, "day", "menu", anchor_names[0])

        for row in day_cell.xpath('.//table[@class="dayinner"]//tr'):
            # A meal-name cell starts a new meal.
            if row.xpath('./td[@class="mealname"]'):
                meal_texts = row.xpath('./td[@class="mealname"]/text()')
                safe_open_tag(stack, lines, "meal", "day", meal_texts[0])

            # A station cell with text starts a new counter.
            station_texts = row.xpath('./td[@class="station"]/text()')
            if station_texts:
                safe_open_tag(stack, lines, "counter", "meal", station_texts[0])

            # A menu-item cell starts a new dish; its name line is appended
            # without indentation.
            if row.xpath('./td[@class="menuitem"]'):
                pieces = row.xpath('./td[@class="menuitem"]/div//text()')
                dish_name = "".join(pieces).strip()
                safe_open_tag(stack, lines, "dish", "counter", "")
                lines.append("<name>%s</name>" % cgi.escape(dish_name))

    # An empty parent matches no tag, so this closes everything still open.
    close_tags(stack, lines, "")
    return '\n'.join([entry.encode('utf-8') for entry in lines])

# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop and close open tags until *parent_tag* is on top of the stack.

    Each popped tag appends one closing line to *output_lines*, indented by
    one space per tag remaining on the stack. If *parent_tag* is never
    found, the stack is emptied. Both lists are modified in place.
    """
    while True:
        if not tag_stack or tag_stack[-1] == parent_tag:
            return
        closed = tag_stack.pop()
        indent = ' ' * len(tag_stack)
        output_lines.append('%s</%s>' % (indent, closed))

# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Append an opening line for *new_tag* and push the tag onto the stack.

    Parameters:
        tag_stack: list of currently open tag names; modified in place.
        output_lines: list of generated lines; modified in place.
        new_tag: element name to open.
        name_property: value for the ``name`` attribute; when falsy the
            tag is written without attributes.

    The line is indented by one space per tag already open.
    """
    indent = ' ' * len(tag_stack)
    if name_property:
        # Escape &, <, > and the double quote so that values such as
        # 'Soup & Salad' still produce a well-formed attribute.
        safe_name = escape(name_property, {'"': '&quot;'})
        output_lines.append('%s<%s name="%s">' % (indent, new_tag, safe_name))
    else:
        output_lines.append('%s<%s>' % (indent, new_tag))
    tag_stack.append(new_tag)

# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open *new_tag* directly beneath *parent_tag*.

    When *parent_tag* is already open, every tag above it on the stack is
    closed first. When it is not open, it is opened without a name
    attribute on top of whatever is currently open. *new_tag* is then
    opened with *name_property*.
    """
    if parent_tag in tag_stack:
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        # An empty name makes open_tag emit a plain '<parent_tag>' line.
        open_tag(tag_stack, output_lines, parent_tag, "")
    open_tag(tag_stack, output_lines, new_tag, name_property)

# sample use of get_xml function


# In[17]:

if __name__ == "__main__":
    # Fetch the index page and pick the menu link(s) covering today's date.
    base_url_u = "http://ift.tt/1qM0Vhe"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)
    if not menu_url_list:
        # Exit with a readable message instead of an IndexError traceback.
        raise SystemExit("No menu link covers today's date; nothing to write.")
    menu_name, menu_url = menu_url_list[0]

    # Build the file name from the link text with slashes, digits and
    # hyphens removed, followed by the fixed suffix.
    ofname = str(menu_name.replace("/", "")) + "rpi_blitman_menu" + ".xml"
    ofname = re.sub(r'[0-9-]', '', ofname)
    output_file = output_path + ofname

    # Generate the XML before opening the file, so a failed download does
    # not leave a truncated file behind, and close the file once written.
    xml_text = get_xml(menu_url)
    with open(output_file, "w") as out:
        out.write(xml_text)
I'm not sure whether this can be done, or whether it is easy.
Thanks in advance for the help.
Python script generates an .xml file: how to change every é to e? [duplicate]

No comments:

Post a Comment