![]() |
|
![]() |
|
|
Thread Tools | Display Modes |
|
|
#1 |
|
Programming Guru
![]() Join Date: Apr 2005
Posts: 1,799
Rep Power: 5
![]() |
BBCode Parser 3.0
All right, I took a shot at a much more thought-out approach to parsing BBCode after my last two attempts weren't exactly sufficient. The neatest part is that this is actually better than the BBCode parser on this forum. It caches parsed posts using a SHA hash. With every code tag, there's a link that automatically copies the contents of the code straight to your clipboard!
![]() I chose not to use Beautiful Soup, mainly because I was without an internet connection while programming this. Here is a demonstration of the BBCode Parser: http://jammersbase.com/forum?f=0&t=16&page=1 Here's the source code, I hope it's a nice approach. import sha
def try_int(x):
    """Report whether ``int(x)`` succeeds.

    Returns True when *x* can be converted with ``int()`` and False when the
    conversion raises ``ValueError`` (e.g. a non-numeric string) or
    ``TypeError`` (e.g. ``None``).
    """
    try:
        int(x)
    except (TypeError, ValueError):
        return False
    return True
class bbcode:
    """Translate forum BBCode markup into HTML and cache the rendered result.

    Typical use is ``bbcode().parse(text)``.  The parser works in stages:
    bare URLs are wrapped in ``[url]`` tags (``mk_bbcode``), every accepted
    tag is located (``overhead_analysis``), tags nested inside ``[noparse]``
    are discarded (``noparse_tags``), and adjacent open/close pairs of the
    same tag are replaced by HTML until none remain (``make_pairs`` /
    ``eliminate_pair``).

    Tag records are tuples:
        open tag:  (True, name, tag_text, index_after_tag, param)
        close tag: (False, name, tag_text, index_of_tag_start)
    """

    def __init__(self):
        # characters allowed in a tag name: a-z followed by A-Z
        self.alphabet = [chr(c) for c in range(97, 123)] + [chr(c) for c in range(65, 91)]
        # 0-9, a-f, A-F: used to validate hexadecimal colour values
        self.hexset = ([chr(c) for c in range(48, 58)]
                       + [chr(c) for c in range(97, 103)]
                       + [chr(c) for c in range(65, 71)])
        # non-letter characters that may appear inside an auto-detected URL
        self.urlaccept = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                          '=', '?', '.', '+', '\\', '&', '_', '-', '/', ':', ';', '#')
        self.accept_tags = ('url', 'img',
                            'b', 'i', 'u',
                            'font', 'color', 'size',
                            'li', 'left', 'center', 'right',
                            'quote', 'code',
                            'noparse')
        self.colors = {'red': 'f00', 'green': '0f0', 'blue': '00f',
                       'black': '000', 'white': 'fff', 'gray': '666',
                       'yellow': 'ff0', 'magenta': 'f0f', 'cyan': '0ff',
                       'orange': 'f60'}
        self.sizes = [str(n) for n in range(1, 9)]
        self.fonts = ('arial', 'courier', 'courier new', 'verdana', 'comic sans')
        # defaults are used in the case of an illegal/missing parameter
        self.default_color = '000'
        self.default_size = '11'
        self.default_font = 'verdana'
        # posts need to be parsed repeatedly, so rendered output is cached
        # by the SHA-1 of the input.  The cache is never pruned.
        self.cache = {}

    @staticmethod
    def _sha_hex(text):
        """Return the SHA-1 hex digest of *text* (str is encoded as UTF-8)."""
        # hashlib replaces the ``sha`` module, which no longer exists in
        # Python 3; imported here so the class does not rely on a
        # module-level import.
        import hashlib
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.sha1(text).hexdigest()

    def reset_open(self):
        """Forget any partially scanned opening tag."""
        self.open_init = 0
        self.init_param = False
        self.param = ''
        self.open_tag = ''

    def reset_close(self):
        """Forget any partially scanned closing tag."""
        self.close_init = 0
        self.close_tag = ''

    def reset(self):
        """Clear both the opening-tag and closing-tag scanner state."""
        self.reset_open()
        self.reset_close()

    def overhead_analysis(self, bbcode):
        """Scan *bbcode* and return the list of accepted tags, in text order.

        The result is also stored on ``self.tags``.  Tag names are matched
        case-insensitively against ``self.accept_tags``; anything else is
        left alone as plain text.
        """
        self.reset()
        self.tags = []
        for i, ch in enumerate(bbcode):
            if ch == '[':
                # a '[' always starts a fresh candidate tag
                self.reset()
                self.open_init = i + 1  # +1 in case i = 0, to keep true
            elif ch == '/' and i > 0 and bbcode[i - 1] == '[':
                # only "[/" starts a closing tag; a '/' anywhere else is data
                self.reset_open()
                self.close_init = i + 1
            elif ch == '=' and not self.init_param and self.open_tag:
                # first '=' after a tag name starts the parameter
                self.init_param = True
            elif ch == ']' and self.open_tag:
                # record open tag
                tag_name = self.open_tag.lower()
                if tag_name in self.accept_tags:
                    self.tags.append((True, tag_name,
                                      bbcode[self.open_init - 1:i + 1],
                                      i + 1, self.param))
                self.reset_open()
            elif ch == ']' and self.close_tag:
                # record close tag; close_init - 2 is the index of its '['
                tag_name = self.close_tag.lower()
                if tag_name in self.accept_tags:
                    self.tags.append((False, tag_name,
                                      bbcode[self.close_init - 2:i + 1],
                                      self.close_init - 2))
                self.reset_close()
            elif self.init_param:
                self.param += ch
            elif self.close_init:
                if ch in self.alphabet:
                    self.close_tag += ch
                else:
                    self.reset_close()  # the '[/' appeared as data
            elif self.open_init:
                if ch in self.alphabet:
                    self.open_tag += ch
                else:
                    self.reset_open()  # the '[' appeared as data
        return self.tags

    def html_codes(self, value):
        """Escape ``&`` and ``"`` so *value* can sit inside a quoted attribute."""
        # '&' must be escaped first, otherwise the '&' of '&quot;' would be
        # escaped a second time.
        value = value.replace('&', '&amp;')
        value = value.replace('"', '&quot;')
        return value

    def color_codes(self, value):
        """Return a hex colour (without '#') for a colour name or hex string.

        Known names come from ``self.colors``.  Otherwise *value* must be 3
        or 6 hexadecimal digits, optionally prefixed with '#'; anything else
        yields ``self.default_color``.
        """
        name = value.lower()
        if name in self.colors:
            return self.colors[name]
        digits = value[1:] if value.startswith('#') else value
        if len(digits) not in (3, 6):
            return self.default_color
        if any(char not in self.hexset for char in digits):
            return self.default_color
        return digits

    def make_size(self, size):
        """Map a BBCode size in ``self.sizes`` to a pixel size string.

        Sizes outside ``self.sizes`` yield ``self.default_size``.
        """
        # every entry of self.sizes is a decimal integer string, so the
        # membership test also guarantees int(size) succeeds
        if size not in self.sizes:
            return self.default_size
        return str(6 + int(size) * 2)

    def choose_font(self, font):
        """Return *font* if whitelisted (quoted when it contains a space),
        otherwise ``self.default_font``."""
        if font.lower() not in self.fonts:
            return self.default_font
        if ' ' in font:
            return "'%s'" % font
        return font

    def _render(self, tag, param, value):
        """Return the HTML for one *tag* with optional *param* around *value*."""
        if tag in ('b', 'u', 'i'):
            return '<%s>%s</%s>' % (tag, value, tag)
        if tag == 'quote':
            if param:
                return 'Quote By: <b>%s</b><div class="quote">%s</div>' % (param, value)
            return 'Quote:<div class="quote">%s</div>' % value
        if tag == 'code':
            # we could use <pre> tags, but those seem to destroy multiple newlines
            new_value = value.replace(' ', '&nbsp;')
            # the hash doubles as the div id used by the copy-to-clipboard link
            value_hash = self._sha_hex(new_value)
            label = '<b>%s</b> ' % param if param else ''
            return ('%sCode: <a href="javascript:copy(\'%s\')">Copy To Clipboard</a>'
                    '<div class="code" id="%s">%s</div>'
                    % (label, value_hash, value_hash, new_value))
        if tag == 'url':
            # NOTE(review): only '&' and '"' are escaped; the URL scheme is
            # not validated, so "javascript:" links pass through.
            valid_url = self.html_codes(param if param else value)
            return ('<a href="%s" target="_blank" title="%s">%s</a>'
                    % (valid_url, valid_url, value))
        if tag == 'img':
            if param:
                valid_url = self.html_codes(param)
                alt = self.html_codes(value)
            else:
                valid_url = self.html_codes(value)
                alt = valid_url
            return ('<img src="%s" class="inlineimg" alt="%s" title="%s" />'
                    % (valid_url, alt, alt))
        if tag == 'size':
            return '<span style="font-size:%spx">%s</span>' % (self.make_size(param), value)
        if tag == 'font':
            return '<span style="font-family:%s">%s</span>' % (self.choose_font(param), value)
        if tag == 'color':
            return '<span style="color:#%s">%s</span>' % (self.color_codes(param), value)
        if tag == 'li':
            return '<div class="indent">%s</div>' % value
        if tag in ('left', 'center', 'right'):
            return '<div style="text-align:%s">%s</div>' % (tag, value)
        # 'noparse': the content is emitted untouched
        return value

    def eliminate_pair(self, bbcode, data, a, b):
        """Replace the tag pair ``data[a]`` / ``data[b]`` in *bbcode* with HTML.

        *a* must index an open tag and *b* == a + 1 its close tag.  Both
        records are removed from *data* in place.  Returns ``(bbcode, data)``
        where the positions of all later tags have been shifted by the change
        in text length (a new list is returned when a shift was needed).
        """
        # make sure we are popping in the right order and parsing without
        # missing anything in between (offsetting can't handle that)
        assert b == a + 1
        adata = data[a]
        bdata = data[b]
        del data[a:b + 1]
        value = bbcode[adata[3]:bdata[3]]
        original = adata[2] + value + bdata[2]
        new = self._render(adata[1], adata[4], value)
        # splice "new" over the tags and value by position
        bbcode = bbcode[:adata[3] - len(adata[2])] + new + bbcode[bdata[3] + len(bdata[2]):]
        # every record that followed the pair now sits at index >= a
        offset = len(new) - len(original)
        if offset:
            shifted = [entry[:3] + (entry[3] + offset,) + entry[4:] for entry in data[a:]]
            data = data[:a] + shifted
        return bbcode, data

    def make_pairs(self, bbcode, data):
        """Render every adjacent open/close pair of the same tag in *data*.

        Repeats until no such pair is left, so properly nested tags are
        resolved inside-out.  Tags that never become adjacent to a matching
        partner (unbalanced or interleaved markup) stay in the text as-is.
        Returns the resulting text.
        """
        # iterative rather than recursive so long posts cannot exhaust the
        # interpreter's recursion limit
        while True:
            for i in range(len(data) - 1):
                first, second = data[i], data[i + 1]
                if first[1] == second[1] and first[0] and not second[0]:
                    bbcode, data = self.eliminate_pair(bbcode, data, i, i + 1)
                    break
            else:
                return bbcode

    def mk_bbcode(self, text, signal, add=''):
        """Wrap bare URLs starting with *signal* in ``[url]...[/url]``.

        *signal* is the URL prefix to look for ('http://' or 'www.') and
        *add* is inserted in front of it inside the tag (e.g. 'http://' for
        'www.' links).  Occurrences preceded by ``[img``/``[url`` tags, a
        letter, or 'http://' are considered already handled and skipped.
        """
        while signal in text:
            items = text.split(signal)
            m = len(items)
            # walk backwards over the occurrences of signal until one is
            # found that is not preceded by something disqualifying
            i = 2
            while True:
                prior = items[-i]
                if not prior:
                    # nothing is before the http:// or www.
                    break
                if (prior[-5:-1] in ('[img', '[url')
                        or prior[-1] in self.alphabet
                        or prior[-7:] == 'http://'):
                    i += 1
                    if i > m:
                        # every occurrence is already handled
                        return text
                else:
                    break
            idx = m - i
            head = signal.join(items[:idx + 1])
            segment = items[idx + 1]
            # guess where the URL ends: at the first character that is
            # neither a letter nor listed in self.urlaccept
            length = 0
            for char in segment:
                if char not in self.urlaccept and char not in self.alphabet:
                    break
                length += 1
            url = segment[:length]
            tail = signal.join([segment[length:]] + items[idx + 2:])
            # rebuild by position so an identical earlier substring cannot
            # be replaced by mistake
            text = '%s[url]%s%s%s[/url]%s' % (head, add, signal, url, tail)
        return text

    def noparse_tags(self, data):
        """Remove (in place) all tag records enclosed by balanced noparse tags.

        The noparse open/close records themselves are kept.  Returns *data*.
        """
        i = 0
        depth = 0
        start = None
        while i < len(data):
            tag = data[i]
            if tag[1] == 'noparse':
                # add one if open, subtract one if closed
                depth += 1 if tag[0] else -1
                if start is None:
                    start = i
                if depth == 0:
                    # drop everything between the outermost noparse tags
                    del data[start + 1:i]
                    # continue after the closing tag, now at start + 1
                    i = start + 1
                if depth < 1:
                    # balanced, or a stray closing tag: start over
                    depth = 0
                    start = None
            i += 1
        return data

    def parse(self, bbcode):
        """Return the HTML rendering of *bbcode*, using the cache when possible."""
        bbhash = self._sha_hex(bbcode)
        try:
            return self.cache[bbhash]
        except KeyError:
            pass
        # neutralise raw HTML before generating our own
        bbcode = bbcode.replace('<', '&lt;')
        bbcode = bbcode.replace('>', '&gt;')
        # add bbcode to things we think are urls
        bbcode = self.mk_bbcode(bbcode, 'http://')
        bbcode = self.mk_bbcode(bbcode, 'www.', 'http://')
        data = self.overhead_analysis(bbcode)
        data = self.noparse_tags(data)
        bbcode = self.make_pairs(bbcode, data)
        # done last, so the space in <br /> is not hit by the code tag's
        # replacement of spaces
        bbcode = bbcode.replace('\n', '<br />')
        self.cache[bbhash] = bbcode
        return bbcode
|
|
|
|
|
#2 |
|
Expert Programmer
|
I believe a significant number of people don't bother to close BBcode tags in the order they open them. I would definitely want to fix this if I were to use this code on a forum of my own.
Another nice feature (I don't know if your code already does this) would be to replace any non-alphanumeric characters, excepting select special characters, inside CODE tags with a space. I often copy and paste code into my PHP scripts and end up having to use BBEdit to zap invisible characters that are corrupting the code. By the way, Sane, if you don't mind my asking; What web host do you use to host your Python/CherryPy pages? My host (MediaCatch) only supports Perl and PHP. Last edited by titaniumdecoy; Sep 7th, 2006 at 5:01 PM. |
|
|
|
|
|
#3 |
|
Programming Guru
![]() Join Date: Apr 2005
Posts: 1,799
Rep Power: 5
![]() |
I use my own computer.
|
|
|
|
|
|
#4 |
|
Programmer
|
It looks good and fairly understandable at a glance. But a relatively simple BBCode parser can also be written with regular expressions.
Here is an incomplete set of BBCode->XHTML regexen in YAML (which can be used with a few lines of python) (strings in '' are like r'' in py)bbcode:
- [ "&", "&amp;" ]
- [ "<", "&lt;" ]
- [ ">", "&gt;" ]
- [ "'", '&#39;' ]
- [ '"', '&quot;' ]
- [ '(\r\n|\r|\n)', '<br/>\1']
- [ '\[[Bb]\](.+?)\[/[Bb]\]', '<b>\1</b>' ]
- [ '\[[Ii]\](.+?)\[/[Ii]\]', '<i>\1</i>' ]
- [ '\[[Uu]\](.+?)\[/[Uu]\]', '<u>\1</u>' ]
- [ '\[[Hh]([1-6])\](.+?)\[/[Hh]\1\]', '<h\1>\2</h\1>' ]
- [ '(\s|^)(http|ftp)(://[^\s:]+)', '\1<a href="\2\3">\2\3</a>' ]
- [ '(\s|^)(www\.[^\s/]+\.[a-zA-Z]{2,4}(/[^\s/]*)*)', '\1<a href="http://\2">\2</a>' ]
- [ '(\s|^)(ftp\.[^\s/]+\.[a-zA-Z]{2,4}(/[^\s/]*)*)', '\1<a href="ftp://\2">\2</a>' ]
- [ '(\s|^)([a-zA-Z0-9.]+@[a-zA-Z0-9.]+\.[a-zA-Z]{2,4})', '\1<a href="mailto:\2">\2</a>' ]
- [ '\[[Uu][Rr][Ll]\](.+?)\[/[Uu][Rr][Ll]\]', '<a href="\1">\1</a>' ]
- ['\[[Uu][Rr][Ll]=([^\]]+)\](.+?)\[/[Uu][Rr][Ll]\]', '<a href="\1">\2</a>']
- [ '\[[Ii][Mm][Gg]\](.+?\.(png|gif|jpe?g|svg))\[/[Ii][Mm][Gg]\]', '<img src="\1"/>' ] |
|
|
|
![]() |
| Bookmarks |
| Currently Active Users Viewing This Thread: 1 (0 members and 1 guests) | |
| Thread Tools | |
| Display Modes | |
|
|
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Need help with BBCode | metsfan | JavaScript and Client-Side Browser Scripting | 1 | Aug 14th, 2006 5:53 PM |
| writing a parser in c++ | programmingnoob | C++ | 47 | Jul 24th, 2006 3:28 AM |
| Delphi's Parser Type | SittingDuck | Delphi | 1 | Jan 22nd, 2006 9:51 AM |
| Recursive Decent Parser | Jhaqen | Java | 2 | Jan 14th, 2006 6:22 AM |
| SGML parser | printf | PHP | 1 | Oct 17th, 2005 4:52 PM |