Work at SourceForge, help us to make it a better place! We have an immediate need for a Support Technician in our San Francisco or Denver office.

Close

Pretty Print XML (And HTML)

juntalis
2011-07-17
2013-01-25
  • juntalis
    juntalis
    2011-07-17

    The XML Tools plugin for Notepad++ is awesome and does almost everything I need when I'm working with XML files. There are times, however, when I'm working with markup-oriented files that aren't quite valid, or some return from a webapp that is not only not valid, but also extremely squished and ugly to read. Unfortunately, XML Tools will only format valid markup, so I threw this together. I used a script I found at http://xmlpp.codeplex.com/ as a base, then modified it to be a bit more customizable. It automatically grabs Notepad++'s settings to figure out how to format the markup, but the variables that control those settings are at the top of the script if you'd like to change them.

    # Settings for the Notepad++ Script
    # By default, it grabs your current notepad settings. If you'd prefer something else, however,
    # you can set them below.
    #
    # Original script: Copyright (c) 2008, Fredrik Ekholdt, All rights reserved.
    # Found at http://xmlpp.codeplex.com/
    # Indentation follows the editor: one tab per level when tabs are enabled,
    # otherwise the editor's configured indent size in spaces.
    if editor.getUseTabs():
        __indent_char, __indent_size = '\t', 1
    else:
        __indent_char, __indent_size = ' ', editor.getIndent()
    # Wrap at the edge column when an edge marker is shown; fall back to 100 columns.
    if editor.getEdgeMode() == EDGEVISUALSTYLE.NONE:
        __width = 100
    else:
        __width = editor.getEdgeColumn()
    class xmlpp:
        import sys, re
        indent_char = ' '
    
        def __pprint_line(self, indent_level, line, width, output):
            if line.strip():
                start = ""
                number_chars = 0
                indent_width = 1
                if self.indent_char == '\t':
                    indent_width=editor.getTabWidth()
                for l in range(indent_level):
                    start += self.indent_char
                    number_chars = number_chars + 1
                try:
                    number_chars = number_chars
                    elem_start = re.findall("(\<\W{0,1}\w+:\w+) ?", line)[0]
                    elem_finished = re.findall("([?|\]\]/]*\>)", line)[0] 
                    #should not have *
                    attrs = re.findall("(\S*?\=\".*?\")", line)
                    output.write(start + elem_start)
                    number_chars = (len(start) * indent_width) + len(elem_start)
                    for attr in attrs:
                        if (attrs.index(attr) + 1) == len(attrs):
                            number_chars = number_chars + len(elem_finished)
                        if (number_chars + len(attr) + 1) > width:
                            output.write("\n")
                            for i in range(len(start + elem_start) + 1):
                                output.write(self.indent_char)
                            number_chars = (len(start) * indent_width) + len(elem_start) + 1 
                        else:
                            output.write(self.indent_char)
                            number_chars = number_chars + 1
                        output.write(attr)
                        number_chars = number_chars + len(attr)
                    output.write(elem_finished + "\n")
                except IndexError:
                    #give up pretty print this line
                    output.write(start + line + "\n")
    
        def __pprint_elem_content(self, indent_level, line, output=sys.stdout):
            if line.strip():
                for l in range(indent_level):
                    output.write(self.indent_char)
                output.write(line + "\n")
        def __get_next_elem(self,data):
            start_pos = data.find("<")
            end_pos = data.find(">") + 1
            retval = data[start_pos:end_pos]
            stopper = retval.rfind("/") 
            if stopper < retval.rfind("\""):
                stopper = -1
            single = (stopper > -1 and ((retval.find(">") - stopper) < (stopper - retval.find("<"))))
            ignore_excl = retval.find("<!") > -1
            ignore_question =  retval.find("<?") > -1
            if ignore_excl:
                cdata = retval.find("<![CDATA[") > -1
                if cdata:
                    end_pos = data.find("]]>")
                    if end_pos > -1:
                        end_pos = end_pos + len("]]>")
            elif ignore_question:
                end_pos = data.find("?>") + len("?>")
            ignore = ignore_excl or ignore_question
    
            no_indent = ignore or single
            #print retval, end_pos, start_pos, stopper > -1, no_indent
            return start_pos, \
                   end_pos, \
                   stopper > -1, \
                   no_indent
        def get_pprint(self, xml, indent=4, indentchar=' ', width=80):
            self.indent_char = indentchar
            """Returns the pretty printed xml """
            class out:
                output = ""
                def write(self, string): 
                    self.output += string
            out = out()
            self.pprint(xml, output=out, indent=indent, width=width)
    
            docFormat = notepad.getFormatType(notepad.getCurrentBufferID())
            newLine = '\r\n' if docFormat == FORMATTYPE.WIN else '\n'
            return out.output.replace('\n', newLine)
        def pprint(self,xml, output=None, indent=4, width=80):
            """Pretty print xml. 
            Use output to select output stream. Default is sys.stdout
            Use indent to select indentation level. Default is 4   """
            data = xml
            indent_level = 0
            start_pos, end_pos, is_stop, no_indent  = self.__get_next_elem(data)
            while ((start_pos > -1 and end_pos > -1)):
                self.__pprint_elem_content(indent_level, data[:start_pos].strip(), 
                                     output=output)
                data = data[start_pos:]
                if is_stop and not no_indent:
                    indent_level = indent_level - indent
                self.__pprint_line(indent_level, data[:end_pos - start_pos],
                            width=width, output=output)
                data = data[end_pos - start_pos:]
                if not is_stop and not no_indent :
                    indent_level = indent_level + indent
                if not data:
                    break
                else:
                    start_pos, end_pos, is_stop, no_indent  = self.__get_next_elem(data)
    if __name__ == '__main__':
        # Reformat the whole active document and write the result back into
        # the editor, using the settings gathered at the top of the script.
        printer = xmlpp()
        pretty = printer.get_pprint(
            editor.getText(),
            indent=__indent_size,
            indentchar=__indent_char,
            width=__width
        )
        editor.setText(pretty)