#! /usr/bin/env -S gawk -f


# Start-up: select tab-separated fields and refuse to run without an input
# file named on the command line.
BEGIN {
    # Fields are separated by a single tab.  The separator is written as an
    # escape so an editor that converts tabs to spaces cannot silently break it.
    FS = "\t"

    # ARGV[0] is the interpreter name, so ARGC < 2 means no file was given.
    if (ARGC < 2) {
        # Usage text goes to stderr so it can never be mistaken for converted
        # output when stdout is redirected into a file.
        print "Need a file to work on." > "/dev/stderr"
        print "File needs to use tabs, not spaces (use unexpand -a)" > "/dev/stderr"
        print "and lines ending with lf, not cr or crlf (use sed -i)" > "/dev/stderr"
        print "** watch out for single spaces that should be tabs **" > "/dev/stderr"
        # A non-zero status lets a calling script or Makefile see the failure.
        exit 1
    }
}

# Last-resort handler for lines no other handler recognised.
#
# The line is emitted as a DB directive when it has no label, when its label
# already ends in ":" (the "," to ":" label fix has run by now), or when
# column 2 ends in a two-digit hex value.  With TEA there is always one byte
# per line.  Anything else is echoed as its first three fields.
function isunknown() {
    if ($1 != "" && $1 !~ /:$/ && $2 !~ /[0-9A-F][0-9A-F]H$/) {
        # when everything else fails, just pass the fields through
        printf "%s\t%s\t%s\n", $1, $2, $3
        return
    }

    # hex values that start with a letter get a leading zero
    if ($2 ~ /[A-F][0-9A-F]H$/)
        $2 = "0" $2

    printf "%s\tDB\t%s\t%s\n", $1, $2, $3
}

# One-byte opcode: label, mnemonic and comment are written out unchanged,
# separated by tabs.
function isone() {
    printf "%s\t%s\t%s\n", $1, $2, $3
}

# Two-byte instruction: the operand is on the input line after the mnemonic.
#
# Output is "label<TAB>mnemonic<TAB>operand<TAB>comment" using the comment of
# the mnemonic line; a comment found on the operand line is written on a
# continuation line of its own, indented by four tabs.
#
# The names in the parameter list are locals (the usual awk idiom); callers
# pass no arguments.
function istwo(    label, mnemonic, comment) {
    label = $1
    mnemonic = $2
    comment = $3

    # Read the operand line.  When the input ends here getline leaves the
    # fields untouched, which would otherwise emit the mnemonic as its own
    # operand; print what we have and stop instead.
    if ((getline) <= 0) {
        print label "\t" mnemonic "\t" comment
        return
    }

    # fix hex numbers that start with a letter
    if ($2 ~ /[A-F][0-9A-F]H$/)
        $2 = "0" $2

    # lines read with getline bypass the main rule, so convert the comment
    # marker here
    sub("/", "; ", $3)

    print label "\t" mnemonic "\t" $2 "\t" comment
    if ($3 != "")
        print "\t\t\t\t" $3
}

# Three-byte instruction: the two operand parts are on the next two input
# lines, and are joined as <third line><second line>.
#
#   JMP LABEL
#   CALL 1234H
#   LXI H, LABEL
#
# A third-line value of "0" is dropped, leaving only the second-line operand.
# Otherwise the third-line value loses its trailing radix letter (H, O or B)
# and gains a leading zero when it starts with A-F.
#
# The mnemonic line's comment stays on the instruction line.  Comments found
# on the operand lines are written on continuation lines of their own,
# indented by four tabs, so no comment text is lost.
#
# The names in the parameter list are locals (the usual awk idiom); callers
# pass no arguments.
function isthree(    label, mnemonic, comment, low, high, low_comment, high_comment) {
    label = $1
    mnemonic = $2
    comment = $3

    # The LXI forms arrive fused with their register pair (LXISP, LXIB, LXID,
    # LXIH); split them into "LXI<TAB>reg, ".
    if (match(mnemonic, /^LXI(SP|B|D|H)/))
        mnemonic = "LXI\t" substr(mnemonic, 4, RLENGTH - 3) ", "
    else
        mnemonic = mnemonic "\t"

    # Fetch the two operand lines.  At end of input getline leaves the fields
    # untouched, so only take values from lines that were actually read;
    # missing parts stay empty.
    low = ""
    high = ""
    low_comment = ""
    high_comment = ""
    if ((getline) > 0) {
        low = $2
        low_comment = $3
        if ((getline) > 0) {
            high = $2
            high_comment = $3
        }
    }

    if (high == "0")
        high = ""
    else {
        # must be two numbers: strip the trailing radix letter from the
        # leading part (hex, octal, binary) ...
        sub(/[HOB]$/, "", high)
        # ... and give it a leading zero when it starts with a letter
        if (high ~ /^[A-F]/)
            high = "0" high
    }

    # lines read with getline bypass the main rule, so convert the comment
    # markers here
    sub("/", "; ", low_comment)
    sub("/", "; ", high_comment)

    # put numbers in correct order
    print label "\t" mnemonic high low "\t" comment
    if (low_comment != "")
        print "\t\t\t\t" low_comment
    if (high_comment != "")
        print "\t\t\t\t" high_comment
}

# Single-operand register instruction, e.g. "CMPB" -> "CMP<TAB>B".
#
# The operand is fused onto the end of the mnemonic in column 2.  It is one
# letter except for the PSW operand (e.g. PUSHPSW, POPPSW) and the SP
# register pair (e.g. DADSP, INXSP, DCXSP), which are three and two letters.
#
# The names in the parameter list are locals (the usual awk idiom); callers
# pass no arguments.
function isreg1(    text_len, operand_len) {
    text_len = length($2)

    if ($2 ~ /PSW$/)
        operand_len = 3
    else if ($2 ~ /SP$/)
        # no single-letter register is named "P", so a trailing "SP" can only
        # be the stack pointer
        operand_len = 2
    else
        operand_len = 1

    print $1 "\t" substr($2, 1, text_len - operand_len) "\t" substr($2, text_len - operand_len + 1) "\t" $3
}

# Two-register instruction, e.g. "MOVAB" -> "MOV<TAB>A, B".
# The last two characters of column 2 are the register names; everything
# before them is the mnemonic.
function isreg2() {
    printf "%s\t%s\t%s, %s\t%s\n",
        $1,
        substr($2, 1, length($2) - 2),
        substr($2, length($2) - 1, 1),
        substr($2, length($2), 1),
        $3
}

# Register-immediate instruction, e.g. "MVIA" followed by a value line
# becomes "MVI<TAB>A, value".
#
# The register is the last character of column 2; the immediate value is in
# column 2 of the next input line.  The mnemonic line's comment stays on the
# instruction line; a comment on the value line is written on a continuation
# line of its own, indented by four tabs.
#
# The names in the parameter list are locals (the usual awk idiom); callers
# pass no arguments.
function isregi(    label, instr, comment, n) {
    n = length($2)
    label = $1
    instr = substr($2, 1, n - 1) "\t" substr($2, n, 1)
    comment = $3

    # Read the value line.  When the input ends here getline leaves the
    # fields untouched, which would otherwise emit the fused mnemonic as the
    # immediate value; print what we have and stop instead.
    if ((getline) <= 0) {
        print label "\t" instr "\t" comment
        return
    }

    # fix hex numbers that start with a letter
    if ($2 ~ /[A-F][0-9A-F]H$/)
        $2 = "0" $2

    # lines read with getline bypass the main rule, so convert the comment
    # marker here
    sub("/", "; ", $3)

    print label "\t" instr ", " $2 "\t" comment
    if ($3 != "")
        print "\t\t\t\t" $3
}

# Pseudo-operations: origin lines (marked with "*") and DB/DW definitions.
#
# Origin: column 2 is assumed to look like "*##H ##H" (a bare "*LABEL" also
# works).  The "*" and the first "H " are removed so the two parts join into
# one value, which gets a leading zero when it starts with a letter, and the
# line becomes "label<TAB>ORG<TAB>value<TAB>comment".
#
# DB / DW: column 2 holds space-separated words, "DB LABEL VALUE" or
# "DW LABEL PART PART", and the line becomes
# "LABEL<TAB>EQU<TAB>value<TAB>comment".
# (Still open: deciding when DS should replace DBs.)
#
# Any line that fits neither shape is passed through unchanged so that no
# input is silently dropped.
#
# The names in the parameter list are locals (the usual awk idiom); callers
# pass no arguments.
function ispseudo(    words, count) {
    # fix ORG first; index() looks for a literal asterisk
    if (index($2, "*") > 0) {
        sub(/\*/, "", $2)
        sub("H ", "", $2)
        # fix hex numbers that start with a letter
        if ($2 ~ /[A-F][0-9A-F][0-9A-F][0-9A-F]H$/)
            $2 = "0" $2
        # tab before the comment, like every other emitter in this script
        print $1 "\tORG\t" $2 "\t" $3
        return
    }

    if ($2 ~ /DW|DB/) {
        count = split($2, words, " ")
        # fix hex numbers that start with a letter
        if (words[3] ~ /^[A-F][0-9A-F]H/)
            words[3] = "0" words[3]

        if (count == 3) {
            print words[2] "\tEQU\t" words[3] "\t" $3
            return
        }
        if (count == 4) {
            # drop the "H" of the leading part so the two parts read as one
            # number
            sub("H", "", words[3])
            print words[2] "\tEQU\t" words[3] words[4] "\t" $3
            return
        }
    }

    # not a shape we know how to rewrite: keep the line as it is
    print $1 "\t" $2 "\t" $3
}

# Route pseudo-operations: a column 2 that contains "*" or begins with DB or
# DW goes to ispseudo(); everything else falls through to isunknown().
function trypseudo() {
    if (index($2, "*") || $2 ~ /^D[BW]/)
        ispseudo()
    else
        isunknown()
}

# Dispatch on the first three characters of column 2 for instructions whose
# register operand is fused onto the mnemonic.  Anything unrecognised is
# handed on to trypseudo().
function tryreg() {
    if ($2 ~ /^LXI/)
        isthree()       # register pair plus two operand lines
    else if ($2 ~ /^MOV/)
        isreg2()        # two registers
    else if ($2 ~ /^MVI/)
        isregi()        # register plus immediate on the next line
    else if ($2 ~ /^(ADC|ADD|ANA|CMP|DAD|DCR|DCX|INR|INX|LDA|ORA|POP|PUS|SBB|SUB|STA|XRA)/)
        isreg1()        # single register
    else
        trypseudo()
}

# Recognise three-byte instructions by an exact match on column 2 and hand
# them to isthree(); everything else moves on to tryreg().
function trythree() {
    if ($2 ~ /^(CALL|CC|CNC|CNZ|CM|CP|CPE|CPO|CZ)$/ ||
        $2 ~ /^(JMP|JC|JNC|JNZ|JM|JP|JPE|JPO|JZ)$/ ||
        $2 ~ /^(LDA|LHLD|STA|SHLD)$/)
        isthree()
    else
        tryreg()
}

# Recognise two-byte instructions by an exact match on column 2 and hand
# them to istwo(); everything else moves on to trythree().
function trytwo() {
    if ($2 ~ /^(ACI|ADI|ANI|CPI|IN|ORI|OUT|SBI|SUI|XRI)$/)
        istwo()
    else
        trythree()
}

# Main rule, run for every input line that a handler has not already
# consumed with getline: normalise the comment marker and the label suffix,
# then dispatch on the mnemonic in column 2.
{
    # fix comments: the first "/" on the line becomes "; "
    # (sub() is a no-op when there is no "/")
    sub("/", "; ", $0)

    # fix labels: a trailing "," on column 1 becomes ":".  Only the final
    # character is replaced, and a column 1 that is really a comment is left
    # alone so comment text ending in a comma is not altered.
    if ($1 !~ /^ *;/)
        sub(/,$/, ":", $1)

    if (NF < 2)
        # just output line with no opcodes
        print $0
    else if ($2 ~ /^(CMA|CMC|DAA|DI|EI|HLT|NOP|PCHL|RAL|RAR|RLC|RRC)$/ ||
             $2 ~ /^(RET|RC|RNC|RZ|RNZ|RM|RP|RPE|RPO)$/ ||
             $2 ~ /^(RIM|SIM|SPHL|STC|XCHG|XTHL)$/)
        # 1 byte instructions
        isone()
    else
        # not a one-byte opcode: try the longer forms
        trytwo()
}

# Write the closing END directive once all input has been converted.
END {
    # gawk still runs END rules after an exit in BEGIN.  BEGIN exits when no
    # input file was named (ARGC < 2), so skip the trailer in that case; a
    # bare exit keeps whatever status BEGIN set.
    if (ARGC < 2)
        exit

    # tea treats "end" as a label, so the directive is written here
    print "\tEND"
}