Python script that generates an .xml file: how do I change every é to e? [duplicate]




This question already has an answer here:




I have a Python script that generates an XML file. One of the fields in that file is `<name>`, and I want to change every "é" in it to a plain "e". Here is the Python script:



# Created by Spencer Fontein on 9/28/14.
# Copyright (c) 2014 Spencer Fontein. All rights reserved.

# coding: utf-8

import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re

#where to send the file at the end
output_path = "/home/spencerf/public_html/rpi/"

def Get_website_text(url):
    """Download ``url`` through a cookie-aware, browser-like opener.

    Parameters:
        url: address of the page to fetch.

    Returns:
        The response body exactly as returned by ``response.read()``.

    The response is always closed, even when reading it fails.
    """
    # Cookie jar bound to a file name.  The jar is only used in memory:
    # nothing in this function loads from or saves to 'mfp.cookies'.
    cj = cookielib.MozillaCookieJar('mfp.cookies')

    # Opener that handles redirects and records cookies in the jar.
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj)
    )

    # Pretend we're a web browser and not a python script.
    opener.addheaders = [('User-agent',
                          ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                           'AppleWebKit/535.1 (KHTML, like Gecko) '
                           'Chrome/13.0.782.13 Safari/535.1'))
                         ]

    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Release the connection even if read() raised.
        response.close()


#get union menus
def getUnionMenuUrls(soup):
    """Return ``[name, url]`` pairs for menus whose date range covers today.

    Parameters:
        soup: parsed page exposing ``findAll``; the first
            ``div#accordion_23477`` is searched for ``<a href=...>`` tags
            and only the 2nd and 3rd links are considered.

    Returns:
        A list of ``[name, url]`` lists.  ``name`` is the link text with
        newlines, apostrophes and spaces removed (expected to look like
        ``MM/DD/YYYY-MM/DD/YYYY``); ``url`` is the base URL followed by the
        link's ``href``.

    Raises:
        ValueError: if the text of a ``.htm`` link is not made of
            ``%m/%d/%Y`` dates separated by ``-``.
        IndexError: if the accordion div is missing or the link text holds
            a single date only.
    """
    monthly_urls = soup.findAll('div', {'id': 'accordion_23477'})[0]('a', href=True)[1:3]
    menu_urls = []
    # Compare calendar dates only.  A full datetime for "now" is later than
    # the midnight timestamp parsed for the end date, which used to drop the
    # menu on the last day of its range.
    today = datetime.datetime.today().date()
    url = "http://ift.tt/YegLus"
    for tag in monthly_urls:
        if ".htm" not in tag['href']:
            continue
        name = str(tag.text)
        name = name.replace('\n', '').replace("'", '').replace(' ', '')
        # Split the label into its date strings and parse each one.
        date_range = [datetime.datetime.strptime(d, '%m/%d/%Y').date()
                      for d in name.split('-')]
        if date_range[0] <= today <= date_range[1]:
            menu_urls.append([name, url + tag['href']])
    return menu_urls


def get_xml(url):
    """Download the menu page at ``url`` and convert it to an XML string.

    The page is parsed with ``etree.HTML``.  Every ``td.dayouter`` cell
    becomes a ``<day>``; rows of its ``table.dayinner`` open ``<meal>``,
    ``<counter>`` and ``<dish>`` elements, each dish holding a ``<name>``.

    Parameters:
        url: address of the menu page.

    Returns:
        The generated lines joined with newlines, each line encoded as
        UTF-8.
    """
    tag_stack = []
    output_lines = []

    response = urllib2.urlopen(url)
    try:
        html = response.read().replace('&nbsp;', "")
    finally:
        # Close the connection even if reading fails.
        response.close()
    xml = etree.HTML(html)

    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # make the xml for each day
    for day in days:
        # NOTE(review): assumes every day cell has an <a name=...> child.
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)

        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # change meal -- test the text nodes (as the counter branch
            # does) so an empty meal cell is skipped instead of raising
            # IndexError.
            meal_names = dayinner_tr.xpath('./td[@class="mealname"]/text()')
            if meal_names:
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_names[0])

            # change counter
            counter_names = dayinner_tr.xpath('./td[@class="station"]/text()')
            if counter_names:
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_names[0])

            # change dish
            if dayinner_tr.xpath('./td[@class="menuitem"]'):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))

    # "" never matches a tag, so this closes everything still open.
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])

    return output_string

# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop and close open tags until ``parent_tag`` is on top of the stack.

    Each popped tag adds one ``</tag>`` line to ``output_lines``, indented by
    one space per tag still open.  If ``parent_tag`` is not on the stack,
    every open tag is closed.  Both lists are modified in place.
    """
    while True:
        if not tag_stack or tag_stack[-1] == parent_tag:
            return
        closed = tag_stack.pop()
        indent = ' ' * len(tag_stack)
        output_lines.append(indent + '</' + closed + '>')

# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Append an opening tag line and push ``new_tag`` onto ``tag_stack``.

    Parameters:
        tag_stack: list of currently open tag names (modified in place).
        output_lines: list of generated lines (modified in place).
        new_tag: name of the element to open.
        name_property: value for the ``name`` attribute; when empty the tag
            is written without attributes.

    The line is indented by one space per tag already open.  The attribute
    value is XML-escaped so names containing ``&``, ``<``, ``>`` or ``"``
    still give well-formed output.
    """
    # Imported here so the function does not need a new file-level import.
    from xml.sax.saxutils import escape

    indent = ' ' * len(tag_stack)
    if name_property:
        # Also escape the double quote: the value sits inside name="...".
        safe_name = escape(name_property, {'"': '&quot;'})
        output_lines.append(indent + '<%s name="%s">' % (new_tag, safe_name))
    else:
        output_lines.append(indent + '<%s>' % new_tag)
    tag_stack.append(new_tag)

# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open ``new_tag`` directly beneath ``parent_tag``.

    When ``parent_tag`` is already open, everything above it on the stack is
    closed first.  Otherwise a bare ``<parent_tag>`` line is written and the
    parent is pushed.  ``new_tag`` is then opened with ``name_property``.
    """
    if parent_tag in tag_stack:
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        indent = ' ' * len(tag_stack)
        output_lines.append('%s<%s>' % (indent, parent_tag))
        tag_stack.append(parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)

# sample use of get_xml function


# In[17]:

if __name__ == "__main__":
    # Find the menu page for the current date range, convert it to XML and
    # write the result into output_path.
    base_url_u = "http://ift.tt/1qM0Vhe"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)
    if not menu_url_list:
        # Fail with a readable message instead of a bare IndexError.
        raise SystemExit("no menu found whose date range covers today")
    menu_name, menu_url = menu_url_list[0]

    ofname = str(menu_name.replace("/", "")) + "rpi_blitman_menu" + ".xml"
    # Drop every digit and hyphen from the file name.
    ofname = ''.join(ch for ch in ofname if ch not in '0123456789-')
    output_file = output_path + ofname

    # Build the XML before opening the file, so a failed download does not
    # truncate an existing one; the with-block closes the file.
    xml_text = get_xml(menu_url)
    with open(output_file, "w") as out:
        out.write(xml_text)


I'm not sure whether this can be done, or how easy it would be.


Thanks for the help in advance.


No comments:

Post a Comment