Discussion:
TeX's line breaking in the grub sesh
(too old to reply)
Stefan Ram
2024-08-05 13:09:16 UTC
Permalink
During my grub sesh today, I crushed out TeX's line breaking
in Python in like a hot minute - 36 to be exact. Tryna use
it for plain text, like, monospaced fonts and whatnot. Natch,
I stripped it down to the bare bones, but it's still hella
tight. Already got that "parshape" action goin' on (which
Knuth-Plass can't hang with, if I'm not trippin'). Next up,
I'm finna tackle those "discretionary items" - that's gonna be
gnarly! For sure there's some janky bugs in there, but peep this:

wrap.py

source = 'Ich habe das gebackene Profi-Bettuch bereits gesehen. '

active0 =[ 0, 0, 0, 0 ] # previous, position, quality, line_number
active =[ active0 ]
parshape =[ 10, 20, 20, 20, 20, 20, 20, 20 ]

p = 1
while p < len( source ):
ch = source[ p ]
if ch == ' ':
new_active = []
best_quality = -10000
best_act = active[ 0 ]
for act in active:
a = act[ 1 ]
line_number = act[ 3 ]
target_length = parshape[ line_number ]
this_length = p - a
if this_length > target_length:
pass
else:
quality = this_length - target_length
new_active.append( act )
if quality > best_quality:
best_quality = quality
best_act = act
new_active.append( [ best_act, p, best_quality, best_act[ 3 ]+1 ] )
active = new_active
p += 1
act = active[ -1 ]
buff = []
while act[ 0 ]:
prev = act[ 0 ]
buff.append( source[ prev[1]: act[1] ])
act = prev
for line in reversed( buff ):
print(line)

output

Ich habe das
gebackene Profi-Bettuch
bereits gesehen.
Stefan Ram
2024-08-28 18:34:59 UTC
Permalink
Turns out there was still a glitch in the code! The latest
build now shows a paragraph break with (fingers crossed)
global optimization, taking parshape into account. Now that
I've finally squashed the bug, discretionary items haven't
been baked in yet. That's next on my to-do list though.

Ironically, lines of the following Python 3.12 source code have NOT
been wrapped to the 72 characters recommended for Usenet posts!

main.py

from dataclasses import dataclass
from typing import Optional, List, Iterator
import bisect

source_text = list( ' Lorem ipsum dolor sit amet, consectetur adipiscing elit, '
'sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad '
'minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea '
'commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit '
'esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat '
'non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ' )

parshape =[ 80, 80, 80, 40 ]

@dataclass
class ActiveEntry:
previous: Optional[ 'ActiveEntry' ]= None
position: int = 0 # position in the text
branch: int = 0 # for future version
sum_quality: int = 0 # the sum of all "merits" up to this point
line_number: int = 0 # line started with this (first line = 0)

parshape_length = len( parshape )

def print_up_to( this ):
'''prints the wrapped paragraph up to the point "this".
It starts at the back point "this" and then goes forward
in the text via the linked chain of points. Finally, for
printing the text in the normal order, it then goes forward
again.'''
buff = []
qual = []
sum_quality = this.sum_quality
while this.previous:
previous = this.previous
line = source_text[ previous.position: this.position ]
# print( f'{line = }' )
buff.append( line )
qual.append( this.sum_quality )
this = previous
start_position = 0
# we went backwards, but actually want to print in the normal direction
first = 1
for i,( line, qual ) in enumerate( zip( reversed( buff ), reversed( qual ))):
text = ''.join( line[ start_position: ])
target_length = parshape[ i ]if i < parshape_length else parshape[ -1 ]# dupe!
output = text
print( output[ first: ])
first = 0
start_position = 1 # skip an initial space or something
print()
print( 'Total merits:', sum_quality )
print()

active0 = ActiveEntry()

active_list =[ active0 ]

current_position = 1
source_length = len( source_text )
while current_position < len( source_text ):
ch = source_text[ current_position ]
if ch == ' ': # possible breakpoint
new_active_list = [] # next active list
best_sum_quality = None # best quality summed across this and previous lines, not yet determined
best_act = active_list[ 0 ] # preliminary choice
for active in active_list:
active_position = active.position
line_number = active.line_number
target_length = parshape[ line_number ]if line_number < parshape_length else parshape[ -1 ]# dupe!
distance = current_position - active_position
adjustment = target_length - distance
if adjustment < 0:
# "When an active breakpoint a is encountered for which
# the line from a to b has an adjustment ratio less
# than -1 (that is, when the line can't be shrunk to
# fit the desired length), breakpoint a is removed
# from the active list."
pass # do not transfer into the new active list
else:
new_active_list.append( active )
this_line_quality = -adjustment**2
have_reached_final_space = current_position == source_length - 1 # final ' ' on end of last line
if have_reached_final_space: this_line_quality = 0 # arbitrary whitespace at end is accepted
this_sum_quality = active.sum_quality + this_line_quality
if \
best_sum_quality is None or \
this_sum_quality > best_sum_quality:
best_sum_quality = this_sum_quality
best_predecessor = active
if best_sum_quality is not None:
# make a new active point from current position, linking it to the best active point found
new_active_list.append( ActiveEntry( previous=best_predecessor, position=current_position, branch=0, sum_quality=best_sum_quality, line_number=best_predecessor.line_number+1 ))
active_list = new_active_list
current_position += 1
active = active_list[ -1 ] # the final space

print_up_to( active )

output

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit
in voluptate velit esse cillum dolore
eu fugiat nulla pariatur. Excepteur
sint occaecat cupidatat non proident,
sunt in culpa qui officia deserunt
mollit anim id est laborum.

Total merits: -80

Loading...