From 0254014e06e821c7c2b7cf888e220d42279a807e Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 13 Apr 2018 20:01:50 +0000 Subject: [PATCH 1/6] Enable shortenIDs to recycle existing IDs This patch enables shortenIDs to remap IDs currently in use. This is very helpful to ensure that scour does not change been "optimal" and "suboptimal" choices for IDs as observed in GH#186. Closes: #186 Signed-off-by: Niels Thykier --- scour/scour.py | 81 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 7c8d695..1f8780a 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -690,19 +690,16 @@ def removeUnreferencedElements(doc, keepDefs): return num -def shortenIDs(doc, prefix, unprotectedElements=None): +def shortenIDs(doc, prefix, options): """ Shortens ID names used in the document. ID names referenced the most often are assigned the shortest ID names. - If the list unprotectedElements is provided, only IDs from this list will be shortened. Returns the number of bytes saved by shortening ID names in the document. """ num = 0 identifiedElements = findElementsWithId(doc.documentElement) - if unprotectedElements is None: - unprotectedElements = identifiedElements referencedIDs = findReferencedElements(doc.documentElement) # Make idList (list of idnames) sorted by reference count @@ -710,24 +707,28 @@ def shortenIDs(doc, prefix, unprotectedElements=None): # First check that there's actually a defining element for the current ID name. # (Cyn: I've seen documents with #id references but no element with that ID!) idList = [(len(referencedIDs[rid]), rid) for rid in referencedIDs - if rid in unprotectedElements] + if rid in identifiedElements] idList.sort(reverse=True) idList = [rid for count, rid in idList] # Add unreferenced IDs to end of idList in arbitrary order - idList.extend([rid for rid in unprotectedElements if rid not in idList]) + idList.extend([rid for rid in identifiedElements if rid not in idList]) + # Ensure we do not reuse a protected ID by accident + protectedIDs = _protectedIDs(identifiedElements, options) curIdNum = 1 for rid in idList: curId = intToID(curIdNum, prefix) - # First make sure that *this* element isn't already using - # the ID name we want to give it. + + # Skip ahead if the new ID is in use and protected. + while curId in protectedIDs: + curIdNum += 1 + curId = intToID(curIdNum, prefix) + + # Now check if we found a new ID (we can happen to choose the same + # ID. More likely on a rerun) if curId != rid: - # Then, skip ahead if the new ID is already in identifiedElement. - while curId in identifiedElements: - curIdNum += 1 - curId = intToID(curIdNum, prefix) # Then go rename it. num += renameID(doc, rid, curId, identifiedElements, referencedIDs) curIdNum += 1 @@ -755,7 +756,7 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): Changes the ID name from idFrom to idTo, on the declaring element as well as all references in the document doc. - Updates identifiedElements and referencedIDs. + Updates identifiedElements, but not referencedIDs. Does not handle the case where idTo is already the ID name of another element in doc. @@ -822,34 +823,44 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): node.setAttribute(attr, newValue) num += len(oldValue) - len(newValue) - del referencedIDs[idFrom] - referencedIDs[idTo] = referringNodes + # We deliberately leave referencedIDs alone to enable us to bulk update + # IDs where two nodes swap IDs. (GH#186) + # del referencedIDs[idFrom] + # referencedIDs[idTo] = referringNodes return num +def _protectedIDs(seenIDs, options): + """Return a list of protected IDs out of the seenIDs""" + protectedIDs = [] + if options.protect_ids_prefix or options.protect_ids_noninkscape or options.protect_ids_list: + protect_ids_prefixes = [] + protect_ids_list = [] + if options.protect_ids_list: + protect_ids_list = options.protect_ids_list.split(",") + if options.protect_ids_prefix: + protect_ids_prefixes = options.protect_ids_prefix.split(",") + for id in seenIDs: + protected = False + if options.protect_ids_noninkscape and not id[-1].isdigit(): + protected = True + elif protect_ids_list and id in protect_ids_list: + protected = True + elif protect_ids_prefixes: + if any(id.startswith(prefix) for prefix in protect_ids_prefixes): + protected = True + if protected: + protectedIDs.append(id) + return protectedIDs + + def unprotected_ids(doc, options): u"""Returns a list of unprotected IDs within the document doc.""" identifiedElements = findElementsWithId(doc.documentElement) - if not (options.protect_ids_noninkscape or - options.protect_ids_list or - options.protect_ids_prefix): - return identifiedElements - if options.protect_ids_list: - protect_ids_list = options.protect_ids_list.split(",") - if options.protect_ids_prefix: - protect_ids_prefixes = options.protect_ids_prefix.split(",") - for id in list(identifiedElements): - protected = False - if options.protect_ids_noninkscape and not id[-1].isdigit(): - protected = True - if options.protect_ids_list and id in protect_ids_list: - protected = True - if options.protect_ids_prefix: - for prefix in protect_ids_prefixes: - if id.startswith(prefix): - protected = True - if protected: + protectedIDs = _protectedIDs(identifiedElements, options) + if protectedIDs: + for id in protectedIDs: del identifiedElements[id] return identifiedElements @@ -3527,7 +3538,7 @@ def scourString(in_string, options=None): # shorten ID names as much as possible if options.shorten_ids: - _num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, unprotected_ids(doc, options)) + _num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, options) # scour lengths (including coordinates) for type in ['svg', 'image', 'rect', 'circle', 'ellipse', 'line', From 00cf42b554e11e09348218b9e66d72de178694ae Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 15 Apr 2018 16:22:00 +0000 Subject: [PATCH 2/6] Rename function to match DEP8 conventions Signed-off-by: Niels Thykier --- scour/scour.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 1f8780a..ad675e6 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -714,7 +714,7 @@ def shortenIDs(doc, prefix, options): # Add unreferenced IDs to end of idList in arbitrary order idList.extend([rid for rid in identifiedElements if rid not in idList]) # Ensure we do not reuse a protected ID by accident - protectedIDs = _protectedIDs(identifiedElements, options) + protectedIDs = protected_ids(identifiedElements, options) curIdNum = 1 @@ -831,7 +831,7 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): return num -def _protectedIDs(seenIDs, options): +def protected_ids(seenIDs, options): """Return a list of protected IDs out of the seenIDs""" protectedIDs = [] if options.protect_ids_prefix or options.protect_ids_noninkscape or options.protect_ids_list: @@ -858,7 +858,7 @@ def _protectedIDs(seenIDs, options): def unprotected_ids(doc, options): u"""Returns a list of unprotected IDs within the document doc.""" identifiedElements = findElementsWithId(doc.documentElement) - protectedIDs = _protectedIDs(identifiedElements, options) + protectedIDs = protected_ids(identifiedElements, options) if protectedIDs: for id in protectedIDs: del identifiedElements[id] From d6406a34706e7ae1422595a5eda6d8bc7b50b1a8 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 15 Apr 2018 16:29:44 +0000 Subject: [PATCH 3/6] shortenIDs: Avoid pointless renames of IDs With the current code, scour could do a pointless remap of an ID, where there is no benefit in it. Consider: ```xml ``` In this example, there is no point in swapping the IDs - even if "#b" is used more often than "#a", they have the same length. Besides a performance win on an already scour'ed image, it also mean scour will behave like a function with a fixed-point (i.e. scour eventually stops altering the image). To solve this, we no longer check whether an we find exactly the same ID. Instead, we look at the length of the new ID compared to the original. This gives us a slight complication as we can now "reserve" a "future" ID to avoid the rename. Thanks to Eduard "Ede_123" Braun for providing the test case. Signed-off-by: Niels Thykier --- scour/scour.py | 24 +++++++++++++++++++----- testscour.py | 13 +++++++++++++ unittests/shorten-ids-stable-output.svg | 11 +++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 unittests/shorten-ids-stable-output.svg diff --git a/scour/scour.py b/scour/scour.py index ad675e6..95e24b5 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -715,22 +715,36 @@ def shortenIDs(doc, prefix, options): idList.extend([rid for rid in identifiedElements if rid not in idList]) # Ensure we do not reuse a protected ID by accident protectedIDs = protected_ids(identifiedElements, options) + # IDs that we have used "ahead of time". Happens if we are about to + # rename an ID and there is no length difference between the new and + # the old ID. + consumedIDs = set() curIdNum = 1 for rid in idList: curId = intToID(curIdNum, prefix) - # Skip ahead if the new ID is in use and protected. - while curId in protectedIDs: + # Skip ahead if the new ID has already been used or is protected. + while curId in protectedIDs or curId in consumedIDs: curIdNum += 1 curId = intToID(curIdNum, prefix) - # Now check if we found a new ID (we can happen to choose the same - # ID. More likely on a rerun) - if curId != rid: + # Avoid checking the ID if it will not affect the size of the document + # (e.g. remapping "c" to "a" is not going to win us anything) + if len(curId) != len(rid): # Then go rename it. num += renameID(doc, rid, curId, identifiedElements, referencedIDs) + elif curId < rid: + # If we skip reassigning an ID because it has the same length + # (E.g. skipping "c" -> "a"), then we have to mark the "future" + # ID as taken ("c" in the example). + # The "strictly less than" in the condition is to ensure that we + # properly update curIdNum in the corner case where curId == rid. + consumedIDs.add(rid) + # Use continue here without updating curIdNum to avoid losing + # the current ID. + continue curIdNum += 1 return num diff --git a/testscour.py b/testscour.py index b52d98f..eb62af7 100755 --- a/testscour.py +++ b/testscour.py @@ -1948,6 +1948,19 @@ class ShortenIDsOption(unittest.TestCase): 'Did not update reference to shortened ID') +class ShortenIDsStableOutput(unittest.TestCase): + + def runTest(self): + doc = scourXmlFile('unittests/shorten-ids-stable-output.svg', + parse_args(['--shorten-ids'])) + use_tags = doc.getElementsByTagName('use') + hrefs_ordered = [x.getAttributeNS('http://www.w3.org/1999/xlink', 'href') + for x in use_tags] + expected = ['#a', '#b', '#b'] + self.assertEquals(hrefs_ordered, expected, + '--shorten-ids pointlessly reassigned ids') + + class MustKeepGInSwitch(unittest.TestCase): def runTest(self): diff --git a/unittests/shorten-ids-stable-output.svg b/unittests/shorten-ids-stable-output.svg new file mode 100644 index 0000000..f2df1bc --- /dev/null +++ b/unittests/shorten-ids-stable-output.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + From 91503c6d7e0882ebfff7bb52def7938a63e9864e Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 15 Apr 2018 17:04:38 +0000 Subject: [PATCH 4/6] renameID: Replace referencedIDs with referringNodes This change pushes the responsibility of updating referencedIDs to its callers where needed. The only caller of renameIDs is shortenIDs and that works perfectly fine without updating its copy of referencedIDs. In shortenIDs, we need to be able to lookup which nodes referenced the "original ID" (and not the "new ID"). While shortenIDs *could* update referencedIDs so it remained valid, it is extra complexity for no gain. As an example of this complexity, imagine if two or more IDs are "rotated" like so: Original IDs: a, bb, ccc, dddd Mapping: dddd -> ccc ccc -> bb bb -> a a -> dddd While doable within reasonable performance, we do not need to support it at the moment, so there is no reason to handle that complexity. Signed-off-by: Niels Thykier --- scour/scour.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 95e24b5..d74d3d2 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -700,6 +700,10 @@ def shortenIDs(doc, prefix, options): num = 0 identifiedElements = findElementsWithId(doc.documentElement) + # This map contains maps the (original) ID to the nodes referencing it. + # At the end of this function, it will no longer be valid and while we + # could keep it up to date, it will complicate the code for no gain + # (as we do not reuse the data structure beyond this function). referencedIDs = findReferencedElements(doc.documentElement) # Make idList (list of idnames) sorted by reference count @@ -734,7 +738,7 @@ def shortenIDs(doc, prefix, options): # (e.g. remapping "c" to "a" is not going to win us anything) if len(curId) != len(rid): # Then go rename it. - num += renameID(doc, rid, curId, identifiedElements, referencedIDs) + num += renameID(doc, rid, curId, identifiedElements, referencedIDs.get(rid)) elif curId < rid: # If we skip reassigning an ID because it has the same length # (E.g. skipping "c" -> "a"), then we have to mark the "future" @@ -765,14 +769,12 @@ def intToID(idnum, prefix): return prefix + rid -def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): +def renameID(doc, idFrom, idTo, identifiedElements, referringNodes): """ Changes the ID name from idFrom to idTo, on the declaring element - as well as all references in the document doc. + as well as all nodes in referringNodes. - Updates identifiedElements, but not referencedIDs. - Does not handle the case where idTo is already the ID name - of another element in doc. + Updates identifiedElements. Returns the number of bytes saved by this replacement. """ @@ -786,7 +788,6 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): num += len(idFrom) - len(idTo) # Update references to renamed node - referringNodes = referencedIDs.get(idFrom) if referringNodes is not None: # Look for the idFrom ID name in each of the referencing elements, @@ -837,11 +838,6 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): node.setAttribute(attr, newValue) num += len(oldValue) - len(newValue) - # We deliberately leave referencedIDs alone to enable us to bulk update - # IDs where two nodes swap IDs. (GH#186) - # del referencedIDs[idFrom] - # referencedIDs[idTo] = referringNodes - return num From e25b0dae73a1fb7fa3dbe09369d3e9aacefb359f Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sun, 15 Apr 2018 17:36:07 +0000 Subject: [PATCH 5/6] Remove a (now) unused parameter to renameID Signed-off-by: Niels Thykier --- scour/scour.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index d74d3d2..ad5c7d4 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -738,7 +738,7 @@ def shortenIDs(doc, prefix, options): # (e.g. remapping "c" to "a" is not going to win us anything) if len(curId) != len(rid): # Then go rename it. - num += renameID(doc, rid, curId, identifiedElements, referencedIDs.get(rid)) + num += renameID(rid, curId, identifiedElements, referencedIDs.get(rid)) elif curId < rid: # If we skip reassigning an ID because it has the same length # (E.g. skipping "c" -> "a"), then we have to mark the "future" @@ -769,7 +769,7 @@ def intToID(idnum, prefix): return prefix + rid -def renameID(doc, idFrom, idTo, identifiedElements, referringNodes): +def renameID(idFrom, idTo, identifiedElements, referringNodes): """ Changes the ID name from idFrom to idTo, on the declaring element as well as all nodes in referringNodes. From 039022ee9d04b8a3b9cd5e0e25223f15a6a2fbf6 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 16 Apr 2018 18:49:27 +0000 Subject: [PATCH 6/6] shortenID: Improve tracking of optimal ID lengths Signed-off-by: Niels Thykier --- scour/scour.py | 94 +++++++++++++++++++------ unittests/shorten-ids-stable-output.svg | 6 +- 2 files changed, 75 insertions(+), 25 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index ad5c7d4..1e7316c 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -719,41 +719,91 @@ def shortenIDs(doc, prefix, options): idList.extend([rid for rid in identifiedElements if rid not in idList]) # Ensure we do not reuse a protected ID by accident protectedIDs = protected_ids(identifiedElements, options) - # IDs that we have used "ahead of time". Happens if we are about to - # rename an ID and there is no length difference between the new and - # the old ID. + # IDs that have been allocated and should not be remapped. consumedIDs = set() + # List of IDs that need to be assigned a new ID. The list is ordered + # such that earlier entries will be assigned a shorter ID than those + # later in the list. IDs in this list *can* obtain an ID that is + # longer than they already are. + need_new_id = [] + + id_allocations = list(compute_id_lengths(len(idList) + 1)) + # Reverse so we can use it as a stack and still work from "shortest to + # longest" ID. + id_allocations.reverse() + + # Here we loop over all current IDs (that we /might/ want to remap) + # and group them into two. 1) The IDs that already have a perfect + # length (these are added to consumedIDs) and 2) the IDs that need + # to change length (these are appended to need_new_id). + optimal_id_length, id_use_limit = 0, 0 + for current_id in idList: + # If we are out of IDs of the current length, then move on + # to the next length + if id_use_limit < 1: + optimal_id_length, id_use_limit = id_allocations.pop() + # Reserve an ID from this length + id_use_limit -= 1 + # We check for strictly equal to optimal length because our ID + # remapping may have to assign one node a longer ID because + # another node needs a shorter ID. + if len(current_id) == optimal_id_length: + # This rid is already of optimal length - lets just keep it. + consumedIDs.add(current_id) + else: + # Needs a new (possibly longer) ID. + need_new_id.append(current_id) + curIdNum = 1 - for rid in idList: - curId = intToID(curIdNum, prefix) + for old_id in need_new_id: + new_id = intToID(curIdNum, prefix) # Skip ahead if the new ID has already been used or is protected. - while curId in protectedIDs or curId in consumedIDs: + while new_id in protectedIDs or new_id in consumedIDs: curIdNum += 1 - curId = intToID(curIdNum, prefix) + new_id = intToID(curIdNum, prefix) - # Avoid checking the ID if it will not affect the size of the document - # (e.g. remapping "c" to "a" is not going to win us anything) - if len(curId) != len(rid): - # Then go rename it. - num += renameID(rid, curId, identifiedElements, referencedIDs.get(rid)) - elif curId < rid: - # If we skip reassigning an ID because it has the same length - # (E.g. skipping "c" -> "a"), then we have to mark the "future" - # ID as taken ("c" in the example). - # The "strictly less than" in the condition is to ensure that we - # properly update curIdNum in the corner case where curId == rid. - consumedIDs.add(rid) - # Use continue here without updating curIdNum to avoid losing - # the current ID. - continue + # Now that we have found the first available ID, do the remap. + num += renameID(old_id, new_id, identifiedElements, referencedIDs.get(old_id)) curIdNum += 1 return num +def compute_id_lengths(highest): + """Compute how many IDs are available of a given size + + Example: + >>> lengths = list(compute_id_lengths(512)) + >>> lengths + [(1, 26), (2, 676)] + >>> total_limit = sum(x[1] for x in lengths) + >>> total_limit + 702 + >>> intToID(total_limit, '') + 'zz' + + Which tells us that we got 26 IDs of length 1 and up to 676 IDs of length two + if we need to allocate 512 IDs. + + :param highest: Highest ID that need to be allocated + :return: An iterator that returns tuples of (id-length, use-limit). The + use-limit applies only to the given id-length (i.e. it is excluding IDs + of shorter length). Note that the sum of the use-limit values is always + equal to or greater than the highest param. + """ + step = 26 + id_length = 0 + use_limit = 1 + while highest: + id_length += 1 + use_limit *= step + yield (id_length, use_limit) + highest = int((highest - 1) / step) + + def intToID(idnum, prefix): """ Returns the ID name for the given ID number, spreadsheet-style, i.e. from a to z, diff --git a/unittests/shorten-ids-stable-output.svg b/unittests/shorten-ids-stable-output.svg index f2df1bc..6905ec1 100644 --- a/unittests/shorten-ids-stable-output.svg +++ b/unittests/shorten-ids-stable-output.svg @@ -2,10 +2,10 @@ - + - - + +