;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Copyright (c) Wesley H. Huang. All rights reserved ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; ; set the print-debug-level to control the amount of information that ; is printed: ; ; 0 = print no debugging information ; 1 = print summary of each epoch (for running large numbers of epochs) ; 2 = print Q after each epoch ; 3 = print each state/action pair of each each epoch ; 4 = print Q after each state/action pair (define print-debug-level 1) (load-option 'format) ; create a table "object" that provides a table of values for an ; (action, state) pair. The state can be one of 'north, 'south, ; 'east, and 'west. The state should be a number between 0 and ; (no-states - 1). This data type should provide the following ; operations: ; ; (define q (make-action-table 10)) ; a 10 state (action, state) table ; ; initially all values set to 0 ; ; (q 'get 'north 5) ; (q 'set 'north 5 0.4) ; (q 'reset) ; reset to all zeros ; (q 'reset random) ; reset to random values in [0, 1) ; (q 'print) ; ; (define n (make-action-table 10)) ; (n 'get 'north 5) ; (n 'inc 'north 5) ; (n 'no-states) ; (define (make-action-table no-states) (define the-table (map (lambda (x) (make-list 4 0.0)) (make-list no-states))) (define state-mapping '((north 0) (south 1) (east 2) (west 3))) (define (validargs? args howmany) (and (= (length args) howmany) (member (car args) '(north south east west)) (integer? (cadr args)) (>= (cadr args) 0) (< (cadr args) no-states) (if (= howmany 3) (number? (caddr args)) #t))) (define (set-ref! lst el-no val) (if (= el-no 0) (set-car! lst val) (set-ref! (cdr lst) (- el-no 1) val))) (define (table-set! action state val) (set-ref! (list-ref the-table state) (cadr (assoc action state-mapping)) val)) (define (table-ref action state) (list-ref (list-ref the-table state) (cadr (assoc action state-mapping)))) (define (print-table) (define (pth row table) (if (not (null? table)) (begin (format #t "~@5A: ~@10A ~@10A ~@10A ~@10A~%" row (dec-trunc (caar table)) (dec-trunc (cadar table)) (dec-trunc (caddar table)) (dec-trunc (cadddr (car table)))) (pth (+ row 1) (cdr table))))) (format #t "~%state ~@10A ~@10A ~@10A ~@10A~%" "north" "south" "east" "west") (pth 0 the-table)) (define (random-init-table) (set! the-table (map (lambda (x) (list (random 1.0) (random 1.0) (random 1.0) (random 1.0))) the-table)) '()) (define (table-execute-command command . args) (if (or (and (member command '(get inc)) (not (validargs? args 2))) (and (equal? command 'set) (not (validargs? args 3)))) (begin (display "Improper arguments given to table data type: ") (display args) (display "\nEither improper number of arguments, invalid action, nonintegral state number, state number out of range, or (for the set command) last argument is not a number.\n\n") (error "Invalid arguments")) ; otherwise, process the argument (cond ((equal? command 'get) (table-ref (car args) (cadr args))) ((equal? command 'set) (table-set! (car args) (cadr args) (caddr args))) ((equal? command 'inc) (table-set! (car args) (cadr args) (+ 1 (table-ref (car args) (cadr args))))) ((equal? command 'reset) (if (equal? args '(random)) (random-init-table) (begin (set! the-table (map (lambda (x) (make-list 4 0.0)) the-table)) '()))) ((equal? command 'no-states) (length the-table)) ((equal? command 'print) (print-table)) (else (display "Invalid command given to the table data type: ") (display command) (newline) (error "Invalid command"))))) table-execute-command) ; truncate to three decimal places (define (dec-trunc x) (/ (truncate (* x 1000)) 1000)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; create a world object (used internally in run-learner) to simulate ; the world. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (make-world connectivity action-probabilities) (define (execute-action action state) (if (or (not (member action '(north south east west))) (not (integer? state)) (< state 0) (>= state (length connectivity))) (begin (display "Invalid inputs given to the world: ") (display action) (display " ") (display state) (display "\nEither invalid action, nonintegral state, or state out of range.\n") (error "Invalid inputs")) (let* ((action-number (- 4 (length (member action '(north east south west))))) (dir (pick-direction action-probabilities)) (result-state (list-ref (list-ref connectivity state) (modulo (+ action-number dir) 4)))) ; (display "action number is: ") ; (display action-number) ; (display "\npicked-direction is: ") ; (display dir) ; (display "\npossible transition states are: ") ; (display (list-ref connectivity state)) result-state))) execute-action) (define (pick-direction a-probs) (define (pdh ap-list ran-num index) (if (null? ap-list) (error "Inexplicable error in pick-direction\nPerhaps it's an error in the action probabilities.") (if (< ran-num (car ap-list)) index (pdh (cdr ap-list) (- ran-num (car ap-list)) (+ index 1))))) (pdh a-probs (random 1.0) 0)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; some sample worlds ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define threefour '(;connectivity: cell to the (north east south west) ((4 1 0 0) (1 2 1 0) (5 3 2 1) (6 3 3 2) (7 4 0 4) (9 6 2 5) (6 6 6 6) (7 8 4 7) (8 9 8 7) (9 10 5 8) (10 10 10 10)) ;transition probablities (straight right back left) (0.8 0.1 0 0.1) ;terminal states ((state value) ... ) ((6 -1.0) (10 1.0)) ; reward for nonterminal states -0.04)) (define fourfour '(;connectivity: cell to the (north east south west) ((4 1 0 0) (5 2 1 0) (6 3 2 1) (7 3 3 2) ; cell 4 (8 5 0 4) (5 6 1 4) (9 7 2 5) (7 7 3 6) ; cell 8 (10 8 4 8) (12 9 6 9) ; cell 10 (10 11 8 10) (11 12 11 10) (12 13 9 11) (13 13 13 12)) ;transition probabilities (straight right back left) (0.7 0.15 0.15) ;terminal states ((state value) ...) ((5 -1.0) (7 -1.0) (9 1.0)) ;reward for nonterminal states -0.01)) (define fourfour2 '(;connectivity: cell to the (north east south west) ((4 1 0 0) (5 2 1 0) (6 3 2 1) (7 3 3 2) (8 5 0 4) (9 6 1 4) (10 7 2 5) (11 7 3 6) (12 9 4 8) (13 10 5 8) (14 11 6 9) (11 11 7 10) (12 13 8 12) (13 14 9 12) (14 14 10 13)) ;transition probablities (straight right back left) (0.7 0.15 0 0.15) ;terminal states ((state value) ...) ((4 -1.0) (6 -1.0) (9 1.0)) ; reward for nonterminal states -0.04)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; abstract a policy from your Q values (and print it) ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (print-policy q) (define (pph state) (if (< state (q 'no-states)) (let* ((dirs '(north south east west)) (vals (map (lambda (a) (q 'get a state)) dirs)) (maxval (apply max vals))) (format #t "State ~@2A: ~@5A (~A)~%" state (list-ref dirs (- 4 (length (member maxval vals)))) (dec-trunc maxval)) (pph (+ state 1))))) (display "\nPolicy:\n\nState #: action (Qmax)\n-----------------------\n") (pph 0)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; given a world and the number of epochs, this function sets up the Q ; and N tables, sets up the world, and feeds your q-learner function ; traning example and take the action returned to take the next step. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; make the world, Q, and N global so that you can look at them after ; learning (define w '()) (define w-desc '()) (define Q '()) (define N '()) (define (setup-run world) (set! w (make-world (car world) (cadr world))) (set! w-desc world) (set! Q (make-action-table (length (car world)))) (set! N (make-action-table (length (car world))))) (define (run-learner world nepochs) (setup-run world) ; put the terminal state rewards in the q map (map (lambda (state-reward) (map (lambda (action) (q 'set action (car state-reward) (cadr state-reward))) '(north south east west))) (caddr world)) (run-epochs nepochs)) (define (run-epochs nepochs) (define (find-reward state) (cadddr w-desc)) (define (do-trial state prev-state prev-action nsteps) (if (not (null? prev-state)) (n 'inc prev-action prev-state)) (let ((term-state (assoc state (caddr w-desc))) (action (q-learner state prev-state prev-action (find-reward prev-state) q n))) (if (and (not (= nsteps 0)) (= (modulo nsteps 200) 0)) (format #t "~%~A steps... " nsteps)) (if term-state (if (>= print-debug-level 1) (format #t "Reached terminal state ~A in ~A steps; reward is ~A~%" state nsteps (cadr term-state))) (begin (if (>= print-debug-level 3) (begin (display state) (display "->") (display action) (newline))) (if (>= print-debug-level 4) (q 'print)) (do-trial (w action state) state action (+ nsteps 1)))))) (define (do-epochs epoch-no nepochs) (if (<= epoch-no nepochs ) (begin (if (or (> print-debug-level 0) (= (modulo epoch-no 25) 0)) (begin (display "Epoch ") (display epoch-no) (display ": ") (if (= (modulo epoch-no 25) 0) (newline)))) (if (>= print-debug-level 3) (display "\n\n")) (do-trial 0 '() '() 0) (if (>= print-debug-level 2) (q 'print)) (do-epochs (+ epoch-no 1) nepochs)))) (newline) (do-epochs 1 nepochs) (q 'print) (print-policy q) ; then give information about error and stuff? )