Loading pybtex/bibtex/utils.py +96 −28 Original line number Diff line number Diff line Loading @@ -447,23 +447,87 @@ def split_name_list(string): return split_tex_string(string, ' [Aa][Nn][Dd] ') @fix_unicode_literals_in_doctest def _find_closing_brace(string): """ >>> _find_closing_brace('') (u'', u'') >>> _find_closing_brace('no braces') (u'no braces', u'') >>> _find_closing_brace('brace at the end}') (u'brace at the end}', u'') >>> _find_closing_brace('two closing braces}}') (u'two closing braces}', u'}') >>> _find_closing_brace('two closing} braces} and some text') (u'two closing}', u' braces} and some text') >>> _find_closing_brace('more {nested{}}{braces}} and the rest}') (u'more {nested{}}{braces}}', u' and the rest}') """ up_to_brace = [] brace_level = 1 while brace_level >= 1: next_brace = BRACE_RE.search(string) if not next_brace: break up_to_brace.append(string[:next_brace.end()]) string = string[next_brace.end():] if next_brace.group() == '{': brace_level += 1 elif next_brace.group() == '}': brace_level -= 1 else: raise ValueError(next_brace.group()) if not up_to_brace: up_to_brace, string = [string], '' return ''.join(up_to_brace), string @fix_unicode_literals_in_doctest def _split(string, sep): """ >>> sep = re.compile(',') >>> list(_split('', sep)) [u''] >>> list(_split('a,b,c', sep)) [u'a', u'b', u'c'] >>> list(_split('a,b,c,', sep)) [u'a', u'b', u'c', u''] """ while True: match = sep.search(string) if not match: yield string break else: yield string[:match.start()] string = string[match.end():] # "\ " is a "control space" in TeX, i. e. "a space that is not to be ignored" # -- The TeXbook, Chapter 3: Controlling TeX, p 8 # ~ is a space character, according to BibTeX # \~ is not a space character BIBTEX_SPACE_RE = re.compile(r'(\\ |\s|(?<!\\)~)+') BRACE_RE = re.compile(r'{|}') @fix_unicode_literals_in_doctest def split_tex_string(string, sep=None, strip=True, filter_empty=False): r"""Split a string using the given separator (regexp). Everything at brace level > 0 is ignored. Separators at the edges of the string are ignored. >>> split_tex_string('') [] >>> split_tex_string(' ') [] >>> split_tex_string(' ', ' ', strip=False, filter_empty=False) [u' ', u' '] >>> split_tex_string('.a.b.c.', r'\.') [u'.a', u'b', u'c.'] [u'', u'a', u'b', u'c', u''] >>> split_tex_string('.a.b.c.{d.}.', r'\.') [u'.a', u'b', u'c', u'{d.}.'] [u'', u'a', u'b', u'c', u'{d.}', u''] >>> split_tex_string('Matsui Fuuka') [u'Matsui', u'Fuuka'] >>> split_tex_string('{Matsui Fuuka}') Loading @@ -476,35 +540,39 @@ def split_tex_string(string, sep=None, strip=True, filter_empty=False): [u'a'] >>> split_tex_string('on a') [u'on', u'a'] >>> split_tex_string(r'Qui\~{n}onero-Candela, J.') [u'Qui\\~{n}onero-Candela,', u'J.'] """ if sep is None: # "\ " is a "control space" in TeX, # i. e. "a space that is not to be ignored" # The TeXbook, Chapter 3: Controlling TeX, p 8 sep = r'(\\ |[\s~])+' sep = BIBTEX_SPACE_RE filter_empty = True sep_re = re.compile(sep) brace_level = 0 name_start = 0 sep = re.compile(sep) result = [] string_len = len(string) pos = 0 for pos, char in enumerate(string): if char == '{': brace_level += 1 elif char == '}': brace_level -= 1 elif brace_level == 0 and pos > 0: match = sep_re.match(string[pos:]) if match: sep_len = len(match.group()) if pos + sep_len < string_len: result.append(string[name_start:pos]) name_start = pos + sep_len if name_start < string_len: result.append(string[name_start:]) word_parts = [] while True: head, brace, string = string.partition('{') if head: head_parts = list(_split(head, sep)) for word in head_parts[:-1]: result.append(''.join(word_parts + [word])) word_parts = [] word_parts.append(head_parts[-1]) if brace: word_parts.append(brace) up_to_closing_brace, string = _find_closing_brace(string) word_parts.append(up_to_closing_brace) else: break if word_parts: result.append(''.join(word_parts)) if strip: result = [part.strip() for part in result] if filter_empty: Loading tests/parse_name_test.py +3 −2 Original line number Diff line number Diff line Loading @@ -173,8 +173,9 @@ sample_names = [ (['Johannes', 'Adam', 'Ferdinand', 'Alois', 'Josef', 'Maria', 'Marko'], ["d'Aviano", 'Pius', 'von', 'und', 'zu'], ['Liechtenstein'],[])), # sort of wrong, but BibTeX parses it like this (r'Brand\~{a}o, F', (['F'], [], ['Brand\\', '{a}o'], [])), (r'Brand\~{a}o, F', (['F'], [], [r'Brand\~{a}o'], [])), # but BibTeX parses it like this: # (r'Brand\~{a}o, F', (['F'], [], ['Brand\\', '{a}o'], [])), # incorrectly formatted name strings below Loading Loading
pybtex/bibtex/utils.py +96 −28 Original line number Diff line number Diff line Loading @@ -447,23 +447,87 @@ def split_name_list(string): return split_tex_string(string, ' [Aa][Nn][Dd] ') @fix_unicode_literals_in_doctest def _find_closing_brace(string): """ >>> _find_closing_brace('') (u'', u'') >>> _find_closing_brace('no braces') (u'no braces', u'') >>> _find_closing_brace('brace at the end}') (u'brace at the end}', u'') >>> _find_closing_brace('two closing braces}}') (u'two closing braces}', u'}') >>> _find_closing_brace('two closing} braces} and some text') (u'two closing}', u' braces} and some text') >>> _find_closing_brace('more {nested{}}{braces}} and the rest}') (u'more {nested{}}{braces}}', u' and the rest}') """ up_to_brace = [] brace_level = 1 while brace_level >= 1: next_brace = BRACE_RE.search(string) if not next_brace: break up_to_brace.append(string[:next_brace.end()]) string = string[next_brace.end():] if next_brace.group() == '{': brace_level += 1 elif next_brace.group() == '}': brace_level -= 1 else: raise ValueError(next_brace.group()) if not up_to_brace: up_to_brace, string = [string], '' return ''.join(up_to_brace), string @fix_unicode_literals_in_doctest def _split(string, sep): """ >>> sep = re.compile(',') >>> list(_split('', sep)) [u''] >>> list(_split('a,b,c', sep)) [u'a', u'b', u'c'] >>> list(_split('a,b,c,', sep)) [u'a', u'b', u'c', u''] """ while True: match = sep.search(string) if not match: yield string break else: yield string[:match.start()] string = string[match.end():] # "\ " is a "control space" in TeX, i. e. "a space that is not to be ignored" # -- The TeXbook, Chapter 3: Controlling TeX, p 8 # ~ is a space character, according to BibTeX # \~ is not a space character BIBTEX_SPACE_RE = re.compile(r'(\\ |\s|(?<!\\)~)+') BRACE_RE = re.compile(r'{|}') @fix_unicode_literals_in_doctest def split_tex_string(string, sep=None, strip=True, filter_empty=False): r"""Split a string using the given separator (regexp). Everything at brace level > 0 is ignored. Separators at the edges of the string are ignored. >>> split_tex_string('') [] >>> split_tex_string(' ') [] >>> split_tex_string(' ', ' ', strip=False, filter_empty=False) [u' ', u' '] >>> split_tex_string('.a.b.c.', r'\.') [u'.a', u'b', u'c.'] [u'', u'a', u'b', u'c', u''] >>> split_tex_string('.a.b.c.{d.}.', r'\.') [u'.a', u'b', u'c', u'{d.}.'] [u'', u'a', u'b', u'c', u'{d.}', u''] >>> split_tex_string('Matsui Fuuka') [u'Matsui', u'Fuuka'] >>> split_tex_string('{Matsui Fuuka}') Loading @@ -476,35 +540,39 @@ def split_tex_string(string, sep=None, strip=True, filter_empty=False): [u'a'] >>> split_tex_string('on a') [u'on', u'a'] >>> split_tex_string(r'Qui\~{n}onero-Candela, J.') [u'Qui\\~{n}onero-Candela,', u'J.'] """ if sep is None: # "\ " is a "control space" in TeX, # i. e. "a space that is not to be ignored" # The TeXbook, Chapter 3: Controlling TeX, p 8 sep = r'(\\ |[\s~])+' sep = BIBTEX_SPACE_RE filter_empty = True sep_re = re.compile(sep) brace_level = 0 name_start = 0 sep = re.compile(sep) result = [] string_len = len(string) pos = 0 for pos, char in enumerate(string): if char == '{': brace_level += 1 elif char == '}': brace_level -= 1 elif brace_level == 0 and pos > 0: match = sep_re.match(string[pos:]) if match: sep_len = len(match.group()) if pos + sep_len < string_len: result.append(string[name_start:pos]) name_start = pos + sep_len if name_start < string_len: result.append(string[name_start:]) word_parts = [] while True: head, brace, string = string.partition('{') if head: head_parts = list(_split(head, sep)) for word in head_parts[:-1]: result.append(''.join(word_parts + [word])) word_parts = [] word_parts.append(head_parts[-1]) if brace: word_parts.append(brace) up_to_closing_brace, string = _find_closing_brace(string) word_parts.append(up_to_closing_brace) else: break if word_parts: result.append(''.join(word_parts)) if strip: result = [part.strip() for part in result] if filter_empty: Loading
tests/parse_name_test.py +3 −2 Original line number Diff line number Diff line Loading @@ -173,8 +173,9 @@ sample_names = [ (['Johannes', 'Adam', 'Ferdinand', 'Alois', 'Josef', 'Maria', 'Marko'], ["d'Aviano", 'Pius', 'von', 'und', 'zu'], ['Liechtenstein'],[])), # sort of wrong, but BibTeX parses it like this (r'Brand\~{a}o, F', (['F'], [], ['Brand\\', '{a}o'], [])), (r'Brand\~{a}o, F', (['F'], [], [r'Brand\~{a}o'], [])), # but BibTeX parses it like this: # (r'Brand\~{a}o, F', (['F'], [], ['Brand\\', '{a}o'], [])), # incorrectly formatted name strings below Loading