Parsing A (modified) RIS File With Python

February 05, 2023 Post a Comment

I have a bunch of (modified) RIS files. The toy example looks like the following: Record #1 of 2 ID: CN-01160769 AU: Uedo N AU: Kasiser R TI: Development of an E-learning system SO

Solution 1:

You just need to add a check that skips the "Record #x of 2" header lines.

import re

class RIS:
    """Parser for a (modified) RIS-style bibliography export.

    Each record starts with an ``ID`` field and is closed by a ``YR`` field.
    Completed records are collected in ``self.records`` as dictionaries that
    map the two-character tag to its value; the repeatable tags ``AU`` and
    ``AD`` map to lists of values.  The ``YR`` value itself is not stored,
    it only marks the end of the record.
    """

    # "XX: value" where XX is an upper-case letter followed by an upper-case
    # letter or a digit.
    _FIELD_RE = re.compile(r"^([A-Z][A-Z0-9]): (.*)")
    # Tags that may legitimately occur several times inside one record.
    _LIST_TAGS = ("AU", "AD")

    def __init__(self, in_file=None):
        """Create the parser and, if *in_file* is given, parse it.

        Args:
            in_file: Optional iterable of text lines (e.g. an open file).
        """
        self.records = []
        # Initialised here as well so process_field() can be called safely
        # before parse() has ever run.
        self.current_tag = None
        self.current_record = None
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """Parse every line of *in_file* and append the records found.

        Blank lines are ignored.  Untagged lines containing ``#`` (such as
        the ``Record #1 of 2`` headers) are skipped.

        Args:
            in_file: Iterable of text lines.

        Raises:
            ValueError: If a line is neither a tagged field nor a skippable
                ``#`` line, or if ``process_field`` rejects a field.
        """
        self.current_tag = None
        self.current_record = None
        for raw_line in in_file:
            line = raw_line.strip()
            if not line:
                continue
            match = self._FIELD_RE.match(line)
            if match:
                # Tagged fields are matched first so that a value which
                # happens to contain "#" is parsed rather than dropped.
                tag, field = match.groups()
                self.process_field(tag, field)
            elif "#" in line:
                # Untagged line with "#", e.g. "Record #1 of 2": skip it.
                continue
            else:
                raise ValueError(line)

    def process_field(self, tag, field):
        """Store one ``tag``/``field`` pair in the record being built.

        Args:
            tag: Two-character field tag.
            field: Field value.

        Raises:
            ValueError: If a tag other than ``ID`` appears while no record is
                open, or if a non-repeatable tag occurs twice in one record.
        """
        if tag == "ID":
            # NOTE: a record still open here (never closed by "YR") is
            # replaced without being added to self.records.
            self.current_record = {tag: field}
            return

        if self.current_record is None:
            # Without this guard the code below would fail with an opaque
            # TypeError, or "YR" would append None to self.records.
            raise ValueError("Tag outside of a record: %s" % tag)

        if tag == "YR":
            # "YR" closes the record; its value is not kept.
            self.records.append(self.current_record)
            self.current_record = None
        elif tag in self._LIST_TAGS:
            self.current_record.setdefault(tag, []).append(field)
        elif tag in self.current_record:
            raise ValueError("Duplicate tag: %s" % tag)
        else:
            self.current_record[tag] = field

def main():
    """Parse ``test.ris`` and pretty-print the records that were found."""
    from pprint import PrettyPrinter

    with open("test.ris", "rt") as handle:
        parsed = RIS(handle)
        printer = PrettyPrinter()
        printer.pprint(parsed.records)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

The added code:

if "#" in line:
    continue

The output is:

[{'AU': ['Uedo N', 'Kasiser R'],
  'ID': 'CN-01160769',
  'SO': 'United European Gastroenterology Journal',
  'TI': 'Development of an E-learning system'},
 {'AU': ['Krogh LQ'],
  'ID': 'CN-01070265',
  'SO': 'Resuscitation',
  'TI': 'E-learning in pediatric basic life support'}]

Python Playground

Parsing A (modified) RIS File With Python

Solution 1:

Post a Comment for "Parsing A (modified) RIS File With Python"