Here's a semi-useful function I created when I ran into a problem while parsing html.
def splitwithout(text, s, void):
    """Split ``text`` on ``s``, skipping separators inside ``void`` pairs.

    A character of ``text`` that is found in ``void`` opens a protected
    region; the region stays open until that same character appears again.
    Separators inside a protected region do not split, and the characters
    that delimit the region are kept in the returned pieces.

    text -- the string to split.
    s    -- the separator; it is compared against one character at a time.
    void -- container of characters that open and close a protected region.

    Returns a list of pieces.  Adjacent separators produce empty strings,
    and a region that is never closed protects the rest of ``text``.

    >>> splitwithout('a "b c" d', ' ', ('"',))
    ['a', '"b c"', 'd']
    """
    res = []
    start = 0      # index where the piece currently being collected begins
    opener = None  # the void character that opened the current region, if any
    for i, ch in enumerate(text):
        if ch in void:
            if opener is None:
                opener = ch
            elif ch == opener:
                opener = None
        # Tested after the toggle above, so a character that just opened a
        # region is never treated as a separator.
        if opener is None and ch == s:
            res.append(text[start:i])
            start = i + 1
    res.append(text[start:])
    return res
It does what its name says: it splits on the separator, except where the separator falls between a matching pair of the characters passed through void.
e.g.
print splitwithout("""a b "c d" e f 'g h i' j k l""",
" ",
('"',"'"))
outputs:
>>> ['a', 'b', '"c d"', 'e', 'f', "'g h i'", 'j', 'k', 'l']
Notice how the spaces inside "c d" and 'g h i' were not split on, as split() would normally do (since I passed '"' and "'" through void)? The point of this function is that I can split an HTML tag on its spaces without also splitting on a space that appears inside one of its parameters.
I wonder if this is actually faster than split() is.
Edit:
If you still don't see the point, here's what I was using it for.
def parse_tag(text, open = '<', close = '>'):
    """Parse one simple ``<tag name='value'>content</tag>`` element.

    text  -- the markup to parse.
    open  -- the delimiter that starts a tag.
    close -- the delimiter that ends a tag.

    Returns a ``(tag, params, content)`` tuple, where ``params`` maps each
    parameter name to its value with the surrounding quotes removed, and
    ``content`` is the text between the end of the opening tag and the
    closing ``open + "/" + tag + close`` marker.

    Returns ``False`` when the text cannot be parsed: no opening delimiter,
    a blank tag name, a parameter without ``=`` or without a value, or no
    close delimiter.
    """
    quotes = ('"', "'")
    pieces = text.split(open)
    if len(pieces) < 2:
        # No opening delimiter at all: report failure the same way as every
        # other malformed input instead of raising IndexError.
        return False
    # Split the tag header on spaces, leaving spaces inside quotes alone.
    first = splitwithout(pieces[1].split(close)[0], ' ', quotes)
    tag = first[0]
    if not tag.strip():
        # Blank tag name, e.g. a space directly after the open delimiter.
        return False
    params = {}
    for p in first[1:]:
        # Split on the first '=' only, so values may themselves contain '='.
        pp = p.split('=', 1)
        if len(pp) < 2 or not pp[1]:
            return False
        name, raw = pp
        if raw[0] in quotes:
            # Keep what lies between the opening quote and its next match.
            val = raw.split(raw[0])[1]
        else:
            # Unquoted value: keep it whole.
            val = raw
        params[name] = val
    try:
        # maxsplit=1 keeps any later close delimiters (e.g. nested tags)
        # as part of the content instead of truncating at them.
        content = text.split(open + "/" + tag + close)[0].split(close, 1)[1]
    except IndexError:
        return False
    return tag, params, content
# Demo: parse a single anchor element and print each part of the result.
# print(...) with one argument behaves the same on Python 2 and Python 3.
tag, params, content = parse_tag("""<a href='http://google.ca' title="Hyperlink to Google.ca">Hyperlink</a>""")
print(tag)      # a
print(params)   # {'href': 'http://google.ca', 'title': 'Hyperlink to Google.ca'}
print(content)  # Hyperlink
# Very helpful I think.