Old School Parsing of a file name and path for validation

Owen · September 21, 2023, 11:59pm

  PROGRAM
                MAP
IsValidFilePath    FUNCTION(*STRING pFilePath, <*STRING DeviceName>, <*STRING PathName>, <*STRING FileName>, <*STRING ExtensionName>),BOOL
Main               PROCEDURE()
                END
  CODE
    Main()

Main      PROCEDURE()

FilePathName        STRING(260)
Device              STRING(260)
PathName            STRING(260)
FileName            STRING(260)
FileExtension       STRING(260)
Result              STRING(260)

Window WINDOW('Caption'),AT(,,269,111),GRAY,FONT('Segoe UI',9)
           PROMPT('Path Name:'),AT(7,9),USE(?PROMPT1)
           ENTRY(@s200),AT(47,6,205),USE(FilePathName)
           BUTTON('Test'),AT(221,89),USE(?Test)
           PROMPT('Device:'),AT(7,22),USE(?Device:PROMPT)
           PROMPT('Path:'),AT(7,35),USE(?PathName:PROMPT)
           STRING(@S255),AT(46,22,207,10),USE(Device)
           STRING(@S255),AT(46,35,207,10),USE(PathName)
           PROMPT('File:'),AT(7,49),USE(?File:PROMPT)
           STRING(@S255),AT(46,49,207,10),USE(FileName)
           PROMPT('Extension:'),AT(7,62),USE(?Extension:PROMPT)
           STRING(@S255),AT(46,62,207,10),USE(FileExtension)
           PROMPT('Message:'),AT(7,75),USE(?Message:PROMPT)
           STRING(@S255),AT(46,75,207,10),USE(Result)
       END

  CODE
  OPEN(Window)
  ACCEPT
     CASE FIELD()
     OF 0
         CASE EVENT()
         END
     OF ?Test
         CASE EVENT()
         OF EVENT:Accepted
             Device = ''
             PathName = ''
             FileName = ''
             FileExtension = ''
             !IF IsValidFilePath(FilePathName, Device, PathName, FileName, FileExtension) = FALSE
             IF IsValidFilePath(FilePathName) = FALSE
                 Result = 'Invalid Path'
             ELSE
                 Result = 'Path is good'
             END
             DISPLAY()
         END
     END        
  END
  CLOSE(Window)
        
IsValidFilePath      FUNCTION(*STRING pFilePath, <*STRING DeviceName>, <*STRING PathName>, <*STRING FileName>, <*STRING ExtensionName>)
ReturnValue                 BOOL
FilePath                    &STRING
CharacterPosition           LONG
UpperBound                  LONG

ParseState                  LONG
                            ITEMIZE
PARSESTATE:Init                 EQUATE
PARSESTATE:FileExtEnd           EQUATE
PARSESTATE:FileExt              EQUATE
PARSESTATE:FileExtStart         EQUATE
PARSESTATE:FileNameEnd          EQUATE
PARSESTATE:FileName             EQUATE
PARSESTATE:FileNameStart        EQUATE
PARSESTATE:PathNameEnd          EQUATE
PARSESTATE:PathSegmentEnd       EQUATE
PARSESTATE:PathSegment          EQUATE
PARSESTATE:PathSegmentStart     EQUATE
PARSESTATE:PathNameStart        EQUATE
PARSESTATE:DeviceEnd            EQUATE
PARSESTATE:Device               EQUATE
PARSESTATE:DeviceStart          EQUATE
                            END

eReservedDeviceNames        EQUATE('CON PRN AUX NUL COM0COM1COM2COM3COM4COM5COM6COM7COM8COM9LPT0LPT1LPT2LPT3LPT4LPT5LPT6LPT7LPT8LPT9')
eInvalidCharacters          EQUATE('<00H,01H,02H,03H,04H,05H,06H,07H,08H,09H,0AH,0BH,0CH,0DH,0EH,0FH,10H,11H,12H,13H,14H,15H,16H,17H,18H,19H,1AH,1BH,1CH,1DH,1EH,1FH>"*:<<>?/\|')

ReservedDeviceNames         STRING(eReservedDeviceNames)
InvalidCharacters           STRING(eInvalidCharacters)

SegmentStart                LONG
SegmentEnd                  LONG
PathNameEnd                 LONG
Character                   STRING(1)
Slash                       STRING(1)
ReturnDeviceNameYN          BOOL
ReturnPathNameYN            BOOL
ReturnFileNameYN            BOOL
ReturnExtensionNameYN       BOOL
MemCheck                    CLASS
Destruct                        PROCEDURE()
                            END

  CODE
  IF NOT OMITTED(DeviceName)
      ReturnDeviceNameYN    = TRUE
  END
  IF NOT OMITTED(PathName)
      ReturnPathNameYN      = TRUE
  END
  IF NOT OMITTED(FileName)
      ReturnFileNameYN      = TRUE
  END
  IF NOT OMITTED(ExtensionName)
      ReturnExtensionNameYN = TRUE
  END

  ReturnValue       = TRUE
  Slash             = '\'

  FilePath         &= NEW STRING(LEN(pFilePath))
  FilePath          = UPPER(LEFT(pFilePath))
  UpperBound        = LEN(CLIP(FilePath))
  CharacterPosition = UpperBound
  
  IF CLIP(FilePath) = ALL('.', UpperBound)
     ReturnValue = FALSE
  ELSE
     ! We are going to process the string in reverse.  Assume we are looking at a file extension
     ParseState        = PARSESTATE:Init
     LOOP
         IF CharacterPosition < 1
             BREAK
         END
         
         Character = FilePath[CharacterPosition]
         CASE ParseState
         OF PARSESTATE:Init
             ParseState = PARSESTATE:FileExtEnd
             CYCLE
             
         OF PARSESTATE:FileExtEnd
             SegmentEnd = CharacterPosition
             
             CASE Character
             OF '.'
                 ! Musn't have an extention.
                 IF CharacterPosition = 1
                     ! Relative Path to Local Directory
                     ParseState = PARSESTATE:PathNameEnd
                     CYCLE
                 ELSE
                     ParseState = PARSESTATE:FileNameEnd
                 END
                 
             OF '/'
             OROF '\'
                 ! No extention or filename
                 ParseState = PARSESTATE:PathNameEnd
                 CYCLE
             OF ':'
                 ParseState = PARSESTATE:DeviceEnd
                 CYCLE
             ELSE
                 ParseState = PARSESTATE:FileExt
                 CYCLE
             END
             
         OF PARSESTATE:FileExt
             CASE Character
             OF '\'
             OROF '/'
             OROF ':'
                 ! We have been processing a filename so switch parse state
                 SegmentStart = CharacterPosition +1
                 ParseState = PARSESTATE:FileNameStart
                 CYCLE
             OF '.'
                 IF CharacterPosition = 1
                     ! We have been processing a filename or folder that starts with a period.
                     !   Assume it is a file name.
                     SegmentStart = CharacterPosition
                     ParseState   = PARSESTATE:FileNameStart
                     CYCLE
                 ELSIF CharacterPosition = 2
                     CASE FilePath[CharacterPosition -1]
                     OF '\'
                     OROF '/'
                         Slash        = FilePath[CharacterPosition -1]
                         SegmentStart = CharacterPosition
                         CharacterPosition -= 1
                         ParseState   = PARSESTATE:FileNameStart
                         CYCLE
                     END
                 END
                 
                 ! Beginning of the extension
                 SegmentStart = CharacterPosition
                 ParseState   = PARSESTATE:FileExtStart
                 CYCLE
                 
             ELSE
                 IF INSTRING(Character, InvalidCharacters, 1, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
                 IF CharacterPosition = 1
                     ! We have been processing a filename so switch parse state
                     SegmentStart = CharacterPosition
                     ParseState = PARSESTATE:FileNameStart
                     CYCLE
                 END
             END
             
         OF PARSESTATE:FileExtStart
             ! FilePath[ SegmentStart : SegmentEnd ] will contain a file extension which will start with a period character
             ! Is a reserved device name being used
             IF INRANGE(SegmentEnd - SegmentStart -1, 2, 3)
                 IF INSTRING(FilePath[ SegmentStart +1 : SegmentEnd ], ReservedDeviceNames, 4, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
             END
             
             IF ReturnExtensionNameYN = TRUE
                 ExtensionName = pFilePath[ SegmentStart : SegmentEnd ]
             END
  
             ParseState = PARSESTATE:FileNameEnd
             
             
         OF PARSESTATE:FileNameEnd
             SegmentEnd = CharacterPosition
             ParseState = PARSESTATE:FileName
             CYCLE
             
         OF PARSESTATE:FileName
             CASE Character
             OF '/'
             OROF '\'
             OROF ':'
                 Slash        = Character
                 SegmentStart = CharacterPosition +1
                 ParseState   = PARSESTATE:FileNameStart
                 CYCLE
             ELSE
                 IF INSTRING(Character, InvalidCharacters, 1, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
                 IF CharacterPosition = 1
                     SegmentStart = 1
                     ParseState = PARSESTATE:FileNameStart
                     CYCLE
                 END
             END
             
             
         OF PARSESTATE:FileNameStart
             IF INRANGE(SegmentEnd - SegmentStart, 2, 3)     ! A reserved word will contain 3 or 4 characters
                 IF INSTRING(FilePath[ SegmentStart : SegmentEnd ], ReservedDeviceNames, 4, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
             END
  
             IF ReturnFileNameYN = TRUE
                 FileName = pFilePath[ SegmentStart : SegmentEnd ]
             END
  
             CASE Character
             OF ':'
                 ParseState = PARSESTATE:DeviceEnd
                 CYCLE
             OF Slash
                 ParseState = PARSESTATE:PathNameEnd
                 CYCLE
             ELSE
                 IF CharacterPosition = 1
                     BREAK
                 ELSE
                     ParseState = PARSESTATE:PathNameEnd
                     CYCLE
                 END
             END
         
         OF PARSESTATE:PathNameEnd
             PathNameEnd = CharacterPosition
             CASE Character
             OF Slash
                 IF CharacterPosition = 1
                     SegmentStart = CharacterPosition
                     ParseState = PARSESTATE:PathNameStart
                     CYCLE
                 ELSE
                     ParseState = PARSESTATE:PathSegmentEnd
                 END
             ELSE
                 ParseState = PARSESTATE:PathSegmentEnd
                 CYCLE
             END
             
             
         OF PARSESTATE:PathSegmentEnd
             CASE Character
             OF Slash
                 IF CharacterPosition = 1
                     ! UNC Path or Root
                     SegmentStart = CharacterPosition
                     ParseState   = PARSESTATE:PathNameStart
                     CYCLE
                 ELSE
                     ! Double slash is only allowed at the beginning of the string
                     ReturnValue = FALSE
                     BREAK
                 END
                 
             OF ':'
                 IF CharacterPosition = 1
                     ReturnValue = FALSE
                     BREAK
                 ELSE
                     SegmentStart = CharacterPosition +1
                     ParseState   = PARSESTATE:PathNameStart
                     CYCLE
                 END
             END
             SegmentEnd = CharacterPosition
             ParseState = PARSESTATE:PathSegment
             CYCLE
  
         OF PARSESTATE:PathSegment
             CASE Character
             OF Slash
                 SegmentStart = CharacterPosition
                 ParseState   = PARSESTATE:PathSegmentStart
                 CYCLE
             OF ':'
                 SegmentStart = CharacterPosition +1
                 ParseState   = PARSESTATE:PathSegmentStart
                 CYCLE
             ELSE
                 IF INSTRING(Character, InvalidCharacters, 1, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
                 IF CharacterPosition = 1
                     ! Probably a relative path
                     SegmentStart = 1
                     ParseState = PARSESTATE:PathSegmentStart
                     CYCLE
                 END
             END
             
         OF PARSESTATE:PathSegmentStart
             IF INRANGE(SegmentEnd - SegmentStart, 2, 3)     ! A reserved word will contain 3 or 4 characters
                 IF INSTRING(FilePath[ SegmentStart : SegmentEnd ], ReservedDeviceNames, 4, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
             END
             CASE Character
             OF Slash
                 IF CharacterPosition = 1
                     ParseState = PARSESTATE:PathNameStart
                     CYCLE
                 ELSE
                     ParseState = PARSESTATE:PathSegmentEnd
                 END
                 
             OF ':'
                 ParseState = PARSESTATE:PathNameStart
                 CYCLE
             ELSE
                 IF CharacterPosition = 1
                     ParseState = PARSESTATE:PathNameStart
                     CYCLE
                 END
             END
             
             
         OF PARSESTATE:PathNameStart
             IF ReturnPathNameYN = TRUE
                 PathName = pFilePath[ SegmentStart : PathNameEnd ]
             END
             IF CharacterPosition = 1
                 BREAK
             ELSE
                 ParseState   = PARSESTATE:DeviceEnd
                 CYCLE
             END
             
         OF PARSESTATE:DeviceEnd
             CASE Character
             OF ':'
                 IF CharacterPosition = 1
                     ReturnValue = FALSE
                     BREAK
                 ELSE
                     SegmentEnd = CharacterPosition
                     ParseState = PARSESTATE:Device
                 END
             END
                 
         OF PARSESTATE:Device
             IF INSTRING(Character, InvalidCharacters, 1, 1) <> 0
                 ReturnValue = FALSE
                 BREAK
             END
             IF CharacterPosition = 1
                 SegmentStart = CharacterPosition
                 ParseState = PARSESTATE:DeviceStart
                 CYCLE
             END
             
         OF PARSESTATE:DeviceStart
             IF INRANGE(SegmentEnd - SegmentStart, 2, 3)     ! A reserved word will contain 3 or 4 characters
                 IF INSTRING(FilePath[ SegmentStart : SegmentEnd ], ReservedDeviceNames, 4, 1) <> 0
                     ReturnValue = FALSE
                     BREAK
                 END
             END
             IF ReturnDeviceNameYN = TRUE
                 DeviceName = pFilePath[ SegmentStart : SegmentEnd ]
             END
             BREAK
         END
                 
         CharacterPosition -= 1
     END
  END
  
  DISPOSE(FilePath)
  RETURN(ReturnValue)

MemCheck.Destruct                        PROCEDURE()
  CODE
  IF NOT (FilePath &= NULL)
     MESSAGE('Programmer Error: Allocated memory not disposed of')
  END

KevinErskine · September 22, 2023, 1:56pm

Thanks for sharing your approach and always good to see the power of the clarion language.

If you have not gotten CapeSoft’s StringTheory - you really should there are so many functions that will save your hours of coding and testing time…

There are methods in StringTheory that make handling file names easier.

Placing a complete file name in a StringTheory object allows you to access the various parts of the file using the methods FileNameOnly, PathOnly and ExtensionOnly.

vitesse · September 23, 2023, 12:11am

An earlier version of this was posted over on Skype and at least three people (including me) mentioned StringTheory as an easier approach these days. I recall Bruce once saying “If you are not using StringTheory then you are doing it wrong”.

Regardless, this is a great example of a FSM (Finite State Machine). It is interesting that Owen moves the pointer at the bottom of the loop - I have always done this at the top.

Also the simple MemCheck class for checking that allocated memory has definitely been disposed is a great idea. I think I’ll definitely make use of this technique in the future. If I had seen this previously then I had obviously forgotten about it!

Owen: Do you have test data with expected true/false return values that you could post here in case anyone wants to tweak it? thanks.

Mike_Duglas · September 23, 2023, 7:48am

Let’s create separate forum about “how is StringTheory great” because it is not convinient to read about it in every thread.

CarlBarnes · September 23, 2023, 3:50pm

Clarion supports dynamic declaration like below. If you can make it work it’s nice as there is no need for the NEW or DISPOSE.

FilePath STRING( Size(pFilePath) )  !NEW like

I suggested to Bob Z being able to code a DISPOSE attribute like PS &String,DISPOSE to tell the compiler to auto-dispose PS when the procedure returns. So you NEW but did not need to DISPOSE and can use the variable in the RETURN PS

The MemCheck.Destruct() is a PITA when you are stepping through the code in the Debugger. It is disorienting to suddenly step into that code. I run into it with the Critical Procedure class. That code is far more out of sight than this example.

The technique is very useful if you need to RETURN(FilePath) i.e. a NEW’d variable. In the .DESTRUCT you do the DISPOSE and that happens after the it’s value is pushed for return.

A Path can contain periods. The only way to be sure what you have is a File and Not a Path is to see if it exists and look at the Attributes with the Win API with code like below. You can also do this with the DIRECTORY() :Attr field.

gist.github.com

https://gist.github.com/CarlTBarnes/2d89615faa064f0b8437c6a66d0f80e2

FileAttributes.clw

  MAP
FileStripReadOnly    PROCEDURE(string _filename),BOOL !True if Worked

  MODULE('win32.lib')
    SetFileAttributes(*CSTRING FileName,LONG NewFileAttribs),BOOL,PROC,PASCAL,DLL(1),RAW,NAME('SetFileAttributesA')
    GetFileAttributes(*CSTRING FileName),LONG,PASCAL,DLL(1),RAW,NAME('GetFileAttributesA')  !Returns Attribs 
  END
!--------------------------------------------------------  
FileStripReadOnly    PROCEDURE(string _filename) !,BOOL
Attrs    LONG,AUTO

This file has been truncated. show original

jslarve · September 23, 2023, 8:40pm

Luckily, that’s pretty easy to manage with a class. StringTheory is a good example. I often use it for return strings.

Or sometimes I use a dumbstring AmbleScrayLite/Clarion/JS_HexTools.inc at main · jslarve/AmbleScrayLite · GitHub

Bruce · September 23, 2023, 9:05pm

I hear your pain Mike. On the one hand people complain about the progress of clarion left and right, and on the other hand every thread seems to have a simple answer if you are using stringtheory.

Since most code, and hence most questions, involve strings in some way (its a pretty common datatype) its inevitable that a bunch of answers mention it.

It can be frustrating for those answering question after question to write inferior answers though just to avoid mentioning a tool pretty much everyone has.

Of course nothing stops you posting a stringtheory-free reply. Perhaps even one that makes use of the built-in SystemString class.

I get it, some people are time rich, and like to write a bunch of custom code. Ive got no problem with that. Others write code for money (most of the folk writing answers here) and so prefer to focus on program performance and programming efficiency.

Since this forum caters to professionals and hobbiest alike, I think both answers are useful.

Owen · September 24, 2023, 9:16am

The fact is that the code I have posted took a lot of effort to debug and get the nuances right. The code works, but it is hard to maintain.

When I get time, I will re-do it by having a separate lexical analyser from the syntax analyser. The code will become a more maintainable and more flexible.

From what I have seen with String Theory and its capabilities, yes it can split up a file path into four sections. Yes, it can be a lexical analyser and split up a large string based on delimiters. But it won’t provide syntax validation.

Syntax validation should work hand in hand with the lexical analyser but should be a layer above it. Also ideally for fast and efficient code, you would want only one pass through. So you want the lexical analyser to be checking for invalid characters as will as splitting up the string into tokens. The syntax analyser’s job is to validate each token as a whole. I don’t think you will get that with String Theory.

I’m not saying String Theory is bad. It does what it does very well, and in most cases you might not need to care.

I have worked on projects where speed and efficiency of the code has been extremely important. Some years ago I ws on a project where I built a parser which could take a script file and output a web page using a scripting language I designed. Someone decided to create a file that was way beyond what I was expecting to be used and it was taking over 5 minutes to build and output the web page. A rethink on how I was parsing the file was in order and I got it down to almost instantaneous, even under high load.

Some times you have to go old school. But, you want your code to be maintainable as well. That is what this example and following future examples in this topic are all about.

vitesse · September 24, 2023, 10:40am

At the risk of upsetting Mike again, much of StringTheory is pretty fast.

Both pretty fast to write your code using it and pretty fast when running it too.

Bruce · September 24, 2023, 2:12pm

Owen,

the code I have posted took a lot of effort to debug and get the nuances right

I hope it is obvious that im not disparaging your code Owen. Theres nothing wrong with writing code, and in many cases it can be educational to do so. Certainly its worth-while to code ginite state machines as an exercise.

I have worked on projects where speed and efficiency of the code has been extremely important.

StringTheory is optimized for performance. This comes at the cost of code readability sometimes, and makes some maintainence harder, but the primary goal is speed. We (aka Geoff) refactor things constantly to shave tiny amounts of time off things.

If you have a large test set, something that takes at least a few seconds to run, and youre intrrested in performance, please skype it to me and i can compare the two from a performance point of view.

CarlBarnes · September 24, 2023, 11:00pm

The String Theory posts have overwhelmed Owen’s topic while not discussing the code he wrote … beyond saying you can do it with ST.

I would like to flag most of the posts here referencing String Theory as Off Topic then let them be deleted by moderator vote. So what remains will focus on Owen’s code.

@owen it’s your topic so I’ll let you object and keep the ST posts? Or better… you flag the posts you think are off topic.

Edit: Or better to split the ST posts into a new Topic and preserve the content … Owen ???

vitesse · September 25, 2023, 8:16am

Hi Carl

I can’t agree with you on this occasion.

If it turns out you could write code to do the same thing using ST that was simpler to both write and maintain then isn’t that relevant?

ie. compare and contrast two different approaches.

with ST you are working at a higher level of abstraction - often specifying what to do rather than how to do it.

no-one is knocking Owen’s FSM code (which has obviously had a lot of care and attention) - just suggesting there could be easier ways.

I take your point that there has been no ST code given “beyond saying you can do it with ST”.

I was guessing it would be much quicker to write and debug using ST (as much of the nitty gritty low-level details are hidden) and would take far fewer lines of code.

So I have just knocked this up off the top of my head. I should have timed how long it took but it was much less than an hour. Maybe 30 or 40 mins. I am being called down for dinner so I am posting this quickly - it has not yet seen a compiler so there may be typos or bugs. I will test it soon (if someone has a list of filenames with true/false for valid or not then that would be helpful).

In order to knock this up I first wrote a basic “spec”. If this spec is incomplete or contains errors then so will the code obviously.

Happy to have anyone review both spec and code (but remember this was written quickly and not yet tested so please be kind ).

There are around 30 lines of ST code.

Again this is not to criticize Owen’s FSM code - merely to demonstrate an alternative approach and argue that comments about alternative approaches are quite valid and useful.

cheers for now

Geoff

===========
Specs:

check no invalid chars - note I have removed \ / and : from Owen’s list as these are valid in certain contexts
can have backslash or forward slash but not both
double slash is only allowed at the beginning of the string
device is that part up to (before) the first colon ‘:’
there can only be one colon ‘:’
file and extension are after last slash with filename being before last dot and extension after last dot

**edit: I have thought of a couple of extra things and no doubt there will be others. If people like to correct or add to the specs then I will adjust code accordingly.

**edit 2: In my rush I completely forgot about checking for Owen’s “ReservedDeviceNames” so I will look at that in the morning (my time here in Australia) and will post an amended version. Suffice to say it will end up being a little more than the 30 or so lines shown here.

code:

IsValidFilePath      FUNCTION(*STRING pFilePath, <*STRING DeviceName>, <*STRING PathName>, <*STRING FileName>, <*STRING ExtensionName>)
st    stringTheory
slash string('\')
invalidChars   string('<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31>"*<<>?|')
  code 
  st.setValue(pFilePath,st:clip)
  if st.containsA(invalidChars,,st:noClip) then return false.   ! contains bad char
  if st.containsChar(slash)
    if st.containsChar('/') then return false.  ! has BOTH \ and /
  else
    slash = '/'   ! using forward slash not back slash
  end
  if st.findChars(slash & slash) > 1 or st.count(slash & slash) > 1 then return false.  ! Double slash is only allowed at the beginning
  if st.containsChar(':')
    st.split(':')
    if st.records() > 2 then return false.  ! more than one colon
    if ~omitted(DeviceName) then DeviceName = st.getLine(1).
    st.setValue(st.getLine(2))
  end
  if omitted(PathName) and omitted(FileName) and omitted(ExtensionName) then return true.  ! all good - we are done  
  st.split(slash)
  st.setValue(st.getLine(st.records()))  ! get filename & extension
  if ~omitted(FileName)      then FileName      = st.beforeLast('.').
  if ~omitted(ExtensionName) then ExtensionName = st.afterLast('.').
  if ~omitted(PathName)
    st.deleteLine(st.records())  ! delete last line that holds filename & extension
    st.join(slash)
    PathName = st.getValue()
  end
  return true

Owen · September 25, 2023, 11:39pm

Geoff,

Why wouldn’t you use String Theory as a lexical analyser and build a high level syntax parser around it?

From my experience, most programmers don’t really know how to write a parser and resort to the type of code you have provided here. You know better than that.

One of the reasons for the original article is to teach. It is good that String Theory is written to be fast and efficient. So how would you use that efficiency to good effect at a high level?

I would argue that the approach I have taken in writing a FSM parser is the most efficient you can do it because it parses the string in one pass. What approach would you take with string theory to achieve the same thing?

Owen · September 26, 2023, 1:15am

I was probably a bit confrontational in my post and I am sorry for that.

At the end of the day though, the idea “You don’t need to do that, let us do it for you” is a good marketing approach. But it doesn’t cut when it comes to understanding techniques and why they are useful. All it does is suppress knowledge.

Some people might not care and will never encounter the need. But parsing is such a fundamental tool and has so many applications, I think there is a need to improve knowledge of it and who says I am getting right. This is where discussion of the topic at hand is important.

Bruce · September 26, 2023, 3:54am

Morning Owen,

I think it would be useful if you elucidated the point of the code.

From your comments i feel like you were demonstrating a technique. In that context rewriting the code serves no purpose.

I assumed, as did geoff i guess, from the title of the thread, that were looking to parse a filename. But even in that case it lacked a specification (hence why i offered suggestions but no alternate code.)

Can i recommend we change the name of the thread to “creating a lexical parser as a finite state machine” and removing posts that are incompatible with that title?

We collectivly also need to be more careful in future when responding to code being posted. The motivation for the post needs to be clear before responding. Some post to get feedback, some post to demonstate technique, some are posting to show ability and i think we need to be more sensitive to that before piling on.

vitesse · September 26, 2023, 10:48am

Hi Owen

thanks for your thoughts and observations. Different people have different views on “life, the universe and everything!” so we can’t be expected to agree on all matters. And yes I do know a bit about parsing and FSMs. In fact I recall having a discussion with you at one of the ConVic conferences up at Mt Buffalo chalet about the use of yacc and lex. Would have been around the turn of the century.

the only reason I wrote the alternative code using ST was due to the criticism that no-one was doing it!

But a couple of observations on your code first:

there is no way (that I can see) that you can tell what your filename is if it has trailing spaces before the ‘.’ for the extension.

eg. “test.txt” vs “text .txt”

maybe the passed optional filename parameter needs to be cstring (or - cough, cough - a ST object - yea ok “run duck and cover”).

having studied your code a bit more I see the way you have defined expected outputs differs from my code.

eg. your DeviceName includes the colon on the end, your path includes the final slash and your extension includes the dot at the start whereas mine do not.

This is not a bug in either - just a different approach. My “specs” from yesterday included:

device is that part up to (before) the first colon ‘:’
and
extension after last dot

I could have used the built-in ST methods to get file parts that Kevin mentioned above (FileNameOnly, PathOnly and ExtensionOnly) and then just split the PathOnly on the colon to get the device (if any). That would have been much less code but possibly more passes through the data. In some ways I have overcomplicated it in the name of speed.

The ST help includes this example code:

str.SetValue(‘c:\temp\strings\table.tps’)
Path = str.PathOnly()
FileNameWithExtension = str.FileNameOnly()
FilenameNoExtension = str.FileNameOnly( , false)
Extension = str.ExtensionOnly()

Anyway FWIW str.ExtensionOnly() also does not include the leading dot and str.PathOnly() does not include the last slash so I guess I was following that existing ST approach.

My “specs” from yesterday did not make allowance for all dots ‘…’ that your code allows for - I have fixed that in this new version that now also checks for your ReservedDeviceNames.

Adding the ReservedDeviceNames code on the segments added quite some complexity and I did notice what I think are possibly bugs in your code. Well by “bug” I mean it doesn’t match the results of my code or my expectations. (Hey I even compiled and did some basic testing on my code this time!)

examples

'hello.com'                   -> mine says good, yours says invalid
'c:\!\con\abc123\test  .txt'  -> mine says invalid, yours says good - segment has 'con'
'c:\!\lpt0\com1\whatever.txt' -> mine says invalid, yours says good - one segment has 'com1' another 'lpt0'

maybe my expectations are wrong? But I definitely remember in the DOS days sometimes programs had .com instead of .exe (I think it is due to a three char segment matching the first three chars of a 4 char reserved device name - but I will leave it to you to check.)

Some of my code here has been optimized for speed and therefore may be more opaque than necessary - as Bruce alluded to yesterday.

I decided to clear the return fields where the passed value is invalid - might avoid some confusion.

Re this topic - Bruce and Carl have suggested either deleting ST comments or splitting into a different thread. All I will say is having written and optimized this code I would not like it to disappear without trace. Just as Owen’s FSM code can be an educational tool, so can this ST code for anyone interested.

anyway enough for now - here is my code:

IsValidFilePathGeoff PROCEDURE  (*STRING pFilePath,<*STRING DeviceName>,<*STRING PathName>,<*STRING FileName>,<*STRING ExtensionName>) ! Declare Procedure
st           stringTheory
doubleSlash  string('\\')
slash        string(1),over(doubleSlash)
invalidChars string('<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31>"*<<>?|')
deviceNames  string('CON PRN AUX NUL COM0COM1COM2COM3COM4COM5COM6COM7COM8COM9LPT0LPT1LPT2LPT3LPT4LPT5LPT6LPT7LPT8LPT9')
seg4         string(4),auto ! four character segment string used for searching deviceNames
x            long,auto
  code
L loop 1 times ! dummy loop - break out to clear values and return false
    st.setValue(pFilePath,st:clip)
    if st.IsAll('.',,st:noClip)              then break.  ! contains only dots
    if st.containsA(invalidChars,,st:noClip) then break.  ! contains bad char
    if st.containsChar(slash)
      if st.containsChar('/')                then break.  ! has BOTH \ and /
    else
      doubleSlash = '//'                                  ! using forward slash not back slash
    end
    if st.findChars(doubleSlash) > 1 or st.count(doubleSlash) > 1 then break.  ! Double slash is only allowed at the beginning
    if st.containsChar(':')
      st.split(':')
      if st.records() > 2                    then break.  ! more than one colon 
      st.setValue(st.getLine(1))                          ! get device (before colon)
      if st._DataEnd = 3 or st._DataEnd = 4               ! check segment for reserve device name
        seg4 = upper(st.valuePtr[1:st._DataEnd])
        if instring(seg4,deviceNames,4,1)    then break.  ! contains reserved device name
      end
      if ~omitted(DeviceName)  then DeviceName = st.getValue().
      st.setValue(st.getLine(2))                          ! get remaining string after device
    end
    st.split(slash)
    st.setValue(st.getLine(st.records()))                 ! get filename & extension
    st.setLine(st.records(),st.beforeLast('.'))           ! replace filename and extension with just filename in lines queue
    st.addLine(st:end,st.afterLast('.'))                  ! add extension onto end of lines queue 
    loop x = 1 to st.records()
      st.setValue(st.getLine(x))                          ! get segment
      if st._DataEnd = 3 or st._DataEnd = 4               ! check segment for reserve device name
        seg4 = upper(st.valuePtr[1:st._DataEnd])
        if instring(seg4,deviceNames,4,1) then break L.   ! contains reserved device name
      end
    end  
    if ~omitted(FileName)      then FileName      = st.getLine(st.records()-1).
    if ~omitted(ExtensionName) then ExtensionName = st.getLine(st.records()).
    if ~omitted(PathName)
      st.deleteLine(st.records())  ! delete last line that holds extension
      st.deleteLine(st.records())  ! delete last line that now holds filename
      st.join(slash)
      PathName = st.getValue()
    end
    return true ! all good 
  end ! dummy loop
  if ~omitted(DeviceName)    then DeviceName    = ''.  ! clear return values when invalid
  if ~omitted(PathName)      then PathName      = ''.
  if ~omitted(FileName)      then FileName      = ''.
  if ~omitted(ExtensionName) then ExtensionName = ''.
  return false  ! invalid

vitesse · September 26, 2023, 10:58am

yes agree whole-heartedly. Generally the Clarion community is very friendly but it would be great to try to avoid any unnecessary flare ups.

anon77170705 · September 26, 2023, 12:31pm

What’s wrong with the directory name '..' ?

What’s wrong with UNC names?

Actually, file/directory name parts are: (optional) server, share/drive, path, and filename and extension for files.

CarlBarnes · September 26, 2023, 2:34pm

I usually don’t want to let an end user enter ‘.’ or ‘…’ into an ENTRY especially if its for a File Name that would be a Path

If I do want to allow periods I’ll add code to convert to normal file name like (from memory not tested):

IF INLIST(FName,'.','..','.\','..\') THEN 
   FName=LONGPATH(FName)   !Is path only, is that ok ?

ELSIF SUB(FName,1,2)='.\' OR SUB(FName,1,3)='..\'  THEN 
   FName=LONGPATH(FName)   !Could be path only, is that ok ?
END

IIRC something’s don’t work with UNC so I require a mapped driver letter for the EXE folder. One that comes to mind is RUN() , or maybe it was Batch file RUN('Some.BAT'). I also recall problems with a UNC IP path like \\192.168.123.123, maybe that was the main problem not simply any \\UNC\ServerName

I usually allow a \\UNC name for loading a DOS,ASCII,BASIC file, or any file.

anon77170705 · September 26, 2023, 3:31pm

Is this a reason to parse UNC names incorrectly?

RUN works with programs/documents with the names having the UNC format. Both servers’ names and IP addresses are supported at least if these servers are in the LAN.